2
|
1 package pagerank;
|
|
2
|
|
3
|
|
4 import java.io.File;
|
|
5 import java.io.FileInputStream;
|
|
6 import java.io.FileNotFoundException;
|
|
7 import java.io.FileOutputStream;
|
|
8 import java.io.IOException;
|
|
9 import java.util.HashMap;
|
|
10 import java.util.HashSet;
|
|
11 import java.util.Map;
|
|
12
|
|
13 import javax.xml.parsers.ParserConfigurationException;
|
|
14 import javax.xml.parsers.SAXParser;
|
|
15 import javax.xml.parsers.SAXParserFactory;
|
|
16
|
|
17 import org.xml.sax.SAXException;
|
|
18
|
|
19 import com.tinkerpop.blueprints.Graph;
|
|
20 import com.tinkerpop.blueprints.Vertex;
|
|
21 import com.tinkerpop.blueprints.impls.tg.TinkerGraph;
|
|
22 import com.tinkerpop.blueprints.util.io.graphml.GraphMLWriter;
|
|
23
|
|
24 import xmlParser.TextTagParser;
|
|
25
|
|
26 public class LinkConvertGraph {
|
14
|
27
|
|
28
|
|
29
|
|
30
|
|
31 public static void main(String[] args) {
|
|
32 // final String filename = "./resources/article.xml";
|
|
33 final String filename = "/Users/aotokage/testProgram/wiki/ja-pages_current.xml";
|
|
34 // final String fileDB = "./resources/tinkerpopDB";
|
|
35
|
|
36
|
16
|
37 final long PAGENUM = 200000;
|
14
|
38 final String fileDB = "./resources/tinkerGraph"+ Long.toString(PAGENUM);
|
|
39 final String logFile = "./resources/wikiLink"+Long.toString(PAGENUM)+".log";
|
|
40
|
|
41 LinkConvertGraph lcg;
|
|
42
|
|
43 try {
|
|
44 lcg = new LinkConvertGraph(filename);
|
|
45 lcg.parseXml();
|
|
46 // lcg.printHash();
|
|
47
|
|
48 HashMap<String,HashSet<String>> hash = lcg.getHash();
|
|
49
|
|
50 Graph graph = new TinkerGraph();
|
|
51 LinkToVertex ltv = new LinkToVertex(graph);
|
|
52
|
|
53 FileOutputStream fos = new FileOutputStream(logFile);
|
|
54 long countId = 0;
|
16
|
55
|
14
|
56 for (Map.Entry<String, HashSet<String>> map : hash.entrySet()) {
|
|
57 lcg.printLinkLog(map, fos);
|
16
|
58 if (PAGENUM <= countId) break;
|
14
|
59 String pageTitle = map.getKey();
|
|
60 Vertex v;// = graph.addVertex(null);
|
|
61 if ( ltv.getId(pageTitle) == null ) {
|
15
|
62 v = ltv.createVertexWithPageTitle(pageTitle,countId);
|
14
|
63 ltv.setPageRank(v, (Double)0.0);
|
|
64 countId++;
|
16
|
65 if (PAGENUM <= countId) break;
|
14
|
66 } else {
|
|
67 v = ltv.getVertex(pageTitle);
|
|
68 }
|
|
69
|
|
70 for (String linkPageTitle : map.getValue()) {
|
|
71 Vertex linkV;
|
|
72 if ( ltv.getId(linkPageTitle) == null) {
|
15
|
73 linkV = ltv.createVertexWithPageTitle(linkPageTitle,countId);
|
14
|
74 countId++;
|
|
75 ltv.setPageRank(linkV, (Double)0.0);
|
|
76 } else {
|
|
77 linkV = ltv.getVertex(linkPageTitle);
|
|
78 }
|
|
79 ltv.setHasLink(v, linkV);
|
16
|
80 if (PAGENUM <= countId) break;
|
14
|
81 }
|
|
82
|
|
83 }
|
|
84 System.out.println("countId = "+countId);
|
|
85
|
|
86 FileOutputStream out = new FileOutputStream(new File(fileDB));
|
|
87 GraphMLWriter.outputGraph(graph, out);
|
|
88
|
|
89 } catch (FileNotFoundException e) {
|
|
90 e.printStackTrace();
|
|
91 } catch (SAXException e) {
|
|
92 e.printStackTrace();
|
|
93 } catch (ParserConfigurationException e) {
|
|
94 e.printStackTrace();
|
|
95 } catch (IOException e) {
|
|
96 System.err.println("Failed to parse xml");
|
|
97 e.printStackTrace();
|
|
98 }
|
|
99
|
|
100
|
|
101
|
|
102 }
|
2
|
103
|
|
104 private String filename;
|
|
105 private FileInputStream fis;
|
|
106 private SAXParserFactory factory;
|
|
107 private SAXParser parser;
|
|
108 private TextTagParser xmlParser;
|
|
109
|
|
110 private HashMap<String,HashSet<String>> hash;
|
|
111
|
|
112
|
|
113 LinkConvertGraph() throws ParserConfigurationException, SAXException {
|
|
114 xmlParser = new TextTagParser();
|
|
115 factory = SAXParserFactory.newInstance();
|
|
116 parser = factory.newSAXParser();
|
|
117 }
|
|
118
|
|
119 LinkConvertGraph(final String filename) throws FileNotFoundException, ParserConfigurationException, SAXException {
|
|
120 this.filename = filename;
|
|
121 fis = new FileInputStream(filename);
|
|
122 xmlParser = new TextTagParser();
|
|
123 factory = SAXParserFactory.newInstance();
|
|
124 parser = factory.newSAXParser();
|
|
125 }
|
|
126
|
|
127 public void setFilename(final String filename) throws FileNotFoundException {
|
|
128 this.filename = filename;
|
|
129 this.fis = new FileInputStream(filename);
|
|
130 }
|
|
131
|
|
132 private void parseXml() throws SAXException, IOException {
|
|
133 parser.parse(this.fis, this.xmlParser);
|
|
134 hash = xmlParser.getHash();
|
|
135 }
|
|
136
|
|
137 private HashMap<String,HashSet<String>> getHash() {
|
|
138 return hash;
|
|
139 }
|
|
140
|
|
141 public void printHash() {
|
|
142 for (Map.Entry<String,HashSet<String>> entry : hash.entrySet()) {
|
|
143 String title = entry.getKey();
|
|
144 System.out.println("title: " + title);
|
|
145 for (String link : entry.getValue()) {
|
|
146 System.out.println("\t"+link);
|
|
147 }
|
|
148 System.out.println();
|
|
149 }
|
|
150 }
|
|
151
|
|
152 private void printHash(FileOutputStream os) throws IOException {
|
|
153 for (Map.Entry<String,HashSet<String>> entry : hash.entrySet()) {
|
|
154 String title = entry.getKey();
|
|
155 os.write( ("title: " + title + "\n").getBytes());
|
|
156 for (String link : entry.getValue()) {
|
|
157 os.write( ("\t"+link+"\n").getBytes());
|
|
158 }
|
|
159 os.write( ("\n").getBytes());
|
|
160 os.flush();
|
|
161 }
|
|
162 }
|
|
163
|
13
|
164 void printLinkLog(Map.Entry<String, HashSet<String>> map, FileOutputStream os) throws IOException {
|
|
165 String title = map.getKey();
|
|
166 os.write( ("title: " + title + "\n").getBytes());
|
|
167 for (String link : map.getValue()) {
|
|
168 os.write( ("\t"+link+"\n").getBytes());
|
|
169 }
|
|
170 os.write( ("\n").getBytes());
|
|
171 os.flush();
|
|
172 }
|
2
|
173
|
|
174
|
|
175 }
|