Mercurial > hg > Members > nobuyasu > TPPageRank
changeset 2:1744340f8be6 draft
add some java files
author | one |
---|---|
date | Wed, 05 Sep 2012 11:56:21 +0900 |
parents | 08f01b5c4d4a |
children | b44abb9aa09f |
files | src/pagerank/LinkConvertGraph.java src/pagerank/LinkToVertex.java src/pagerank/WikiPage.java src/sample/CreateTinkerGraph.java src/xmlParser/CharReader.java src/xmlParser/TextTagParser.java src/xmlParser/WikiLinkParser.java src/xmlParser/XmlTagObject.java |
diffstat | 8 files changed, 790 insertions(+), 24 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/pagerank/LinkConvertGraph.java Wed Sep 05 11:56:21 2012 +0900 @@ -0,0 +1,158 @@ +package pagerank; + + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.SAXException; + +import com.tinkerpop.blueprints.Graph; +import com.tinkerpop.blueprints.Vertex; +import com.tinkerpop.blueprints.impls.tg.TinkerGraph; +import com.tinkerpop.blueprints.util.io.graphml.GraphMLWriter; + +import xmlParser.TextTagParser; + +public class LinkConvertGraph { + + private String filename; + private FileInputStream fis; + private SAXParserFactory factory; + private SAXParser parser; + private TextTagParser xmlParser; + + private HashMap<String,HashSet<String>> hash; + + + LinkConvertGraph() throws ParserConfigurationException, SAXException { + xmlParser = new TextTagParser(); + factory = SAXParserFactory.newInstance(); + parser = factory.newSAXParser(); + } + + LinkConvertGraph(final String filename) throws FileNotFoundException, ParserConfigurationException, SAXException { + this.filename = filename; + fis = new FileInputStream(filename); + xmlParser = new TextTagParser(); + factory = SAXParserFactory.newInstance(); + parser = factory.newSAXParser(); + } + + public void setFilename(final String filename) throws FileNotFoundException { + this.filename = filename; + this.fis = new FileInputStream(filename); + } + + private void parseXml() throws SAXException, IOException { + parser.parse(this.fis, this.xmlParser); + hash = xmlParser.getHash(); + } + + private HashMap<String,HashSet<String>> getHash() { + return hash; + } + + public void printHash() { + for (Map.Entry<String,HashSet<String>> entry : 
hash.entrySet()) { + String title = entry.getKey(); + System.out.println("title: " + title); + for (String link : entry.getValue()) { + System.out.println("\t"+link); + } + System.out.println(); + } + } + + private void printHash(FileOutputStream os) throws IOException { + for (Map.Entry<String,HashSet<String>> entry : hash.entrySet()) { + String title = entry.getKey(); + os.write( ("title: " + title + "\n").getBytes()); + for (String link : entry.getValue()) { + os.write( ("\t"+link+"\n").getBytes()); + } + os.write( ("\n").getBytes()); + os.flush(); + } + } + + + + + + public static void main(String[] args) { +// final String filename = "./resource/article.xml"; + final String filename = "/Users/aotokage/testProgram/wiki/ja-pages_current.xml"; + + LinkConvertGraph lcg; + + try { + + lcg = new LinkConvertGraph(filename); + + lcg.parseXml(); +// lcg.printHash(); + + FileOutputStream fos = new FileOutputStream("./resource/wikiLink.log"); + lcg.printHash(fos); + + HashMap<String,HashSet<String>> hash = lcg.getHash(); + + + final String filenameD = "./resource/tinkerpopDB"; + + Graph graph = new TinkerGraph(); + FileOutputStream out = new FileOutputStream(new File(filename)); + LinkToVertex ltn = new LinkToVertex(graph); + + for (Map.Entry<String, HashSet<String>> map : hash.entrySet()) { + String pageTitle = map.getKey(); + + Vertex v;// = graph.addVertex(null); + + if ( ltn.getId(pageTitle) == null ) { + v = ltn.createVertexWithPageTitle(pageTitle); + + } else { + v = ltn.getVertex(pageTitle); + } + + for (String linkPageTitle : map.getValue()) { + Vertex linkV; + if ( ltn.getId(linkPageTitle) == null) { + linkV = ltn.createVertexWithPageTitle(linkPageTitle); + ltn.setPageRank(linkV, (Double)0.0); + } else { + linkV = ltn.getVertex(linkPageTitle); + } + ltn.setHasLink(v, linkV); + } + + } + + GraphMLWriter.outputGraph(graph, out); + + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (SAXException e) { + e.printStackTrace(); + } catch 
(ParserConfigurationException e) { + e.printStackTrace(); + } catch (IOException e) { + System.err.println("Failed to parse xml"); + e.printStackTrace(); + } + + + + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/pagerank/LinkToVertex.java Wed Sep 05 11:56:21 2012 +0900 @@ -0,0 +1,231 @@ +package pagerank; + +import java.util.HashMap; + +import com.tinkerpop.blueprints.Direction; +import com.tinkerpop.blueprints.Edge; +import com.tinkerpop.blueprints.Graph; +import com.tinkerpop.blueprints.Vertex; + +import pagerank.WikiPage; + +public class LinkToVertex { + + Graph graph; + public final static String PAGE_TITLE = "pageTitle"; + public final static String PAGE_RANK = "pageRank"; + private HashMap<String, Long> pageIdTable = new HashMap<String, Long>(); + + private HashMap<String, WikiPage> wikiPageHash = new HashMap<String, WikiPage>(); + private long AllNodeNumber; + + private final double weight1 = 0.85; + private final double weight2 = 0.15; + + public static final String HAS_LINK = "HasLink"; + + LinkToVertex(Graph graph) { + this.graph = graph; + AllNodeNumber = 0; + } + + Long getId(String pageTitle) { + return pageIdTable.get(pageTitle); + } + + boolean isHasLink(String label) { + return label.equals(HAS_LINK); + } + + private Vertex createVertex() { + return graph.addVertex(null); + } + + private Vertex createVertex(Object id) { + return graph.addVertex(id); + } + + private Vertex createVertexWithProperty(String key, Object value) { + Vertex v = graph.addVertex(null); + v.setProperty(key,value); + return v; + } + + String getPageTitle(Vertex v) { + return (String) v.getProperty(PAGE_TITLE); + } + + Double getPageRank(Vertex v) { + return (Double) v.getProperty(PAGE_RANK); + } + + Vertex createVertexWithPageTitle(String pageTitle) { + Vertex v = createVertexWithProperty(PAGE_TITLE, pageTitle); + pageIdTable.put(pageTitle, (Long) v.getId()); + return v; + } + + Vertex setPageRank(Vertex v, Double rank) { + v.setProperty(PAGE_RANK, rank); + return v; + } + + Vertex getVertex(String name) { + long id = pageIdTable.get(name); + return graph.getVertex(id); + } + + Vertex getNode(int nodeId) { + return 
graph.getVertex(nodeId); + } + + Edge setRelationship(Vertex v1, Vertex v2, String label) { + Edge e = graph.addEdge(null, v1, v2, label); + return e; + } + + Edge setHasLink(Vertex v1, Vertex v2) { + return setRelationship(v1, v2, HAS_LINK); + } + + long searchAllNodes() { + AllNodeNumber = 0; + for (Vertex v : graph.getVertices()) { + if ( (v.getProperty(PAGE_TITLE) != null) && + (v.getProperty(PAGE_RANK)) != null ) { + WikiPage wiki = new WikiPage(v); + pageIdTable.put((String) v.getProperty(PAGE_TITLE), (Long) v.getId()); + wiki.setInHasLink(computeInHasLink(v)); + wiki.setOutHasLink(computeOutHasLink(v)); + wikiPageHash.put((String) v.getProperty(PAGE_TITLE), wiki); + AllNodeNumber++; + } + } + return AllNodeNumber; + } + + void searchRegiNodes(Vertex v) { + + if ( (v.getProperty(PAGE_TITLE) != null) && + (v.getProperty(PAGE_RANK) != null)) { + WikiPage wiki = new WikiPage(v); + pageIdTable.put((String) v.getProperty(PAGE_TITLE), (Long) v.getId()); + wiki.setInHasLink(computeInHasLink(v)); + wiki.setOutHasLink(computeOutHasLink(v)); + wikiPageHash.put((String) v.getProperty(PAGE_TITLE), wiki); + AllNodeNumber++; + } + } + + HashMap<String, WikiPage> getWikiPageHash() { + return wikiPageHash; + } + + HashMap<String, Long> getPageIdTable() { + return pageIdTable; + } + + public Iterable<Vertex> getAllNodes() { + return graph.getVertices(); + } + + public void printAllNodes() { + for (Vertex v : graph.getVertices() ) { + System.out.println("ID = "+ v.getId()); + for (String key: v.getPropertyKeys()) { + System.out.println(key + "=" + v.getProperty(key)); + } +/* + for (Edge e : v.getEdges(Direction.IN, HAS_LINK) ) { + System.out.println(); + } +*/ + } + System.out.println("--"); + + } + + public long computeOutHasLink(Vertex v) { + long count = 0; + for (Edge edge : v.getEdges(Direction.OUT, HAS_LINK)) { + count++; + } + return count; + } + + public long computeInHasLink(Vertex v) { + long count = 0; + for (Edge edge : v.getEdges(Direction.IN, HAS_LINK)) { + 
count++; + } + return count; + } + + public void printOutHasLink(Vertex v, int depth) { + int numberOfLinkPages = 0; + String output = v.getProperty(PAGE_TITLE) + " outHasLink pages:"; + System.out.println(output); + for (Edge edge : v.getEdges(Direction.OUT, HAS_LINK)) { + Vertex outV = edge.getVertex(Direction.IN); + String str = (String) outV.getProperty(PAGE_TITLE); + System.out.println(str); + numberOfLinkPages++; + } + String numOutput = "Number of outHaslink pages: " + numberOfLinkPages; + System.out.println(numOutput); + } + + public void printInHasLink(Vertex v, int depth) { + int numberOfLinkPages = 0; + String output = v.getProperty(PAGE_TITLE) + " inHasLink pages:"; + System.out.println(output); + for (Edge edge : v.getEdges(Direction.IN, HAS_LINK)) { + Vertex outV = edge.getVertex(Direction.OUT); + String str = (String) outV.getProperty(PAGE_TITLE); + System.out.println(str); + numberOfLinkPages++; + } + String numOutput = "Number of inHaslink pages: " + numberOfLinkPages + "\n"; + System.out.println(numOutput); + } + + public double computePageRank(Vertex v) { + double sum = 0; + double pageRank = 0; + String title = getPageTitle(v); + WikiPage wiki = wikiPageHash.get(title); + + for (Edge edge : v.getEdges(Direction.IN, HAS_LINK) ) { + Vertex linkV = edge.getVertex(Direction.OUT); + sum += (double) ((Double) linkV.getProperty(PAGE_RANK)) / computeInHasLink(linkV) ; + } + + if (computeOutHasLink(v) == 0) { + pageRank = (double) sum * weight1 + + (double) ((double) 1 / AllNodeNumber * weight2); + } else { + pageRank = (double) ((double)sum / computeOutHasLink(v) * weight1) + + (double) ((double) 1 / AllNodeNumber * weight2); + } + wiki.setRank(pageRank); + v.setProperty(PAGE_RANK, pageRank); + return pageRank; + } + + public void printNodeInfo(int nodeId) { + Vertex v = graph.getVertex(nodeId); + printInHasLink(v, 1); + printOutHasLink(v, 1); + + String title = getPageTitle(v); + double rank = getPageRank(v); + long inHasLink = computeInHasLink(v); + 
long outHasLink = computeOutHasLink(v); + + System.out.println("id:"+nodeId+" title:"+title+" rank:"+rank); + System.out.println("inHasLink:"+inHasLink+" outHasLink:"+outHasLink); + + + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/pagerank/WikiPage.java Wed Sep 05 11:56:21 2012 +0900 @@ -0,0 +1,94 @@ +package pagerank; + +import java.io.FileOutputStream; +import java.io.IOException; + +import com.tinkerpop.blueprints.Vertex; + +import pagerank.LinkToVertex; + +public class WikiPage { + + private String title; + private long id; + private Double rank; + private long outHasLink; + private long inHasLink; + + WikiPage() { + this.title = null; + this.id = -1; + this.rank = -1.0; + this.outHasLink = 0; + this.inHasLink = 0; + } + + WikiPage(Vertex v) { + this.title = (String) v.getProperty(LinkToVertex.PAGE_TITLE); + this.id = (Long) v.getId(); + this.rank = (Double)v.getProperty(LinkToVertex.PAGE_RANK); + this.outHasLink = 0; + this.inHasLink = 0; + } + + WikiPage(String title, long id, Double rank) { + this.title = title; + this.id = id; + this.rank = rank; + this.outHasLink = 0; + this.inHasLink = 0; + } + + String getTitle() { + return title; + } + + long getId() { + return id; + } + + double getRank() { + return rank; + } + + long getOutHasLink() { + return outHasLink; + } + + long getInHasLink() { + return inHasLink; + } + + void setTitle(String title) { + this.title = title; + } + + void setId(long id) { + this.id = id; + } + + void setRank(double rank) { + this.rank = rank; + } + + void setOutHasLink(long num) { + this.outHasLink = num; + } + + void setInHasLink(long num) { + this.inHasLink = num; + } + + void printInfo() { + System.out.println("id:"+id+" title:"+title+" rank:"+rank); + System.out.println("outHasLink:"+outHasLink+" inHasLink:"+inHasLink); + } + + void printInfo(FileOutputStream fos) throws IOException { + fos.write(("id:"+id+" title:"+title+" rank:"+rank+"\n").getBytes()); + fos.write(("outHasLink:"+outHasLink+" inHasLink:"+inHasLink+"\n").getBytes()); + fos.write(("\n").getBytes()); + fos.flush(); + } + +}
--- a/src/sample/CreateTinkerGraph.java Tue Sep 04 22:47:53 2012 +0900 +++ b/src/sample/CreateTinkerGraph.java Wed Sep 05 11:56:21 2012 +0900 @@ -1,44 +1,80 @@ package sample; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; + import com.tinkerpop.blueprints.Direction; import com.tinkerpop.blueprints.Edge; import com.tinkerpop.blueprints.Graph; import com.tinkerpop.blueprints.Vertex; import com.tinkerpop.blueprints.impls.tg.TinkerGraph; import com.tinkerpop.blueprints.impls.tg.TinkerGraphFactory; +import com.tinkerpop.blueprints.util.io.graphml.GraphMLReader; +import com.tinkerpop.blueprints.util.io.graphml.GraphMLWriter; public class CreateTinkerGraph { + public static final String filename = "./resources/tinkerpopDB"; + public static void main(String[] args) { - - createTest(); -// readTest(); + + + try { + + outputGraph(); + readGraph(); + + } catch (IOException e) { + e.printStackTrace(); + } + } + public static void outputGraph() throws IOException { + Graph graph = new TinkerGraph(); + FileOutputStream out = new FileOutputStream(new File(filename)); + + Vertex a = graph.addVertex(null); + Vertex b = graph.addVertex(null); + Vertex c = graph.addVertex(null); + Vertex d = graph.addVertex(null); + a.setProperty("name", "maro"); + b.setProperty("name", "Peter"); + c.setProperty("name", "smith"); + d.setProperty("name", "black"); + Edge e = graph.addEdge(null, a, b, "knows"); + Edge e2 = graph.addEdge(null, c, a, "knows"); + Edge e3 = graph.addEdge(null, d, b, "knows"); + System.out.println(e.getVertex(Direction.OUT).getProperty("name") + + "--" + e.getLabel() + "-->" + + e.getVertex(Direction.IN).getProperty("name")); + for (Edge edge : b.getEdges(Direction.IN, "knows")) { + Vertex v =edge.getVertex(Direction.OUT); + System.out.println(v.getProperty("name")); + } + + + GraphMLWriter.outputGraph(graph, out); + + } + + public static void readGraph() throws 
IOException { + Graph graph = new TinkerGraph(); + FileInputStream in = new FileInputStream(new File(filename)); - public static void outputGraph() { - Graph graph = new TinkerGraph("./resources/"); + GraphMLReader.inputGraph(graph, in); + Vertex aa = graph.getVertex("1"); + System.out.println("vertex " + aa.getId() + " has name " + aa.getProperty("name")); + if (aa.getProperty("aaa") != null) + System.out.println(aa.getProperty("aaa")); + for(Edge ee : aa.getEdges(Direction.OUT)) { + System.out.println(ee); + } } - public static void createTest() { - Graph graph = new TinkerGraph("/tmp/tinkergraph"); - Vertex a = graph.addVertex(null); - Vertex b = graph.addVertex(null); - a.setProperty("name", "mariko"); - b.setProperty("name", "Peter"); - Edge e = graph.addEdge(null, a, b, "knows"); - System.out.println(e.getVertex(Direction.OUT).getProperty("name") + "--" + e.getLabel() - + "-->" + e.getVertex(Direction.IN).getProperty("name")); - - } - public static void readTest() { - Graph graph = new TinkerGraph("/tmp/tinkergraph"); - Vertex aa = graph.getVertex("1"); - System.out.println("vertex " + aa.getId() + " has name " + aa.getProperty("name")); - for(Edge ee : aa.getEdges(Direction.OUT)) { - System.out.println(ee); - } - } }
// Source file: src/xmlParser/CharReader.java (declared in package xmlParser).

/**
 * Character-level scanner that extracts wiki link targets of the form
 * {@code [[Target]]} or {@code [[Target|label]]} from a text.
 *
 * Call {@link #setText(String)} once, then call {@link #getToken()}
 * repeatedly until it returns {@code null}.
 */
public class CharReader {

    /** Sentinel returned by {@link #nextChar()} at end of text; a NUL in the text also ends scanning. */
    final static char EOFchar = (char) 0;

    private static final char LBRANK = '[';
    private static final char RBRANK = ']';
    private static final char VERBAR = '|';
    private static final char COLON = ':';

    /** Outside any link, looking for the first '['. */
    private static final int SCANNING = 1;
    /** Saw one '[', expecting the second. */
    private static final int ONE_OPEN = 2;
    /** Inside "[[", collecting the link target. */
    private static final int IN_LINK = 3;
    /** Saw one ']' inside a link, expecting the second. */
    private static final int ONE_CLOSE = 4;

    private String text;
    private int textLength;
    private int index;

    CharReader() {
    }

    /** Sets the text to scan and rewinds to its start; {@code null} is treated as empty text. */
    public void setText(String str) {
        text = (str == null) ? "" : str;
        textLength = text.length();
        index = 0;
    }

    /** Returns the next character and advances, or {@link #EOFchar} when the text is exhausted. */
    char nextChar() {
        if (index < textLength) {
            return text.charAt(index++);
        }
        return EOFchar;
    }

    /**
     * Returns the target of the next wiki link, or {@code null} when the text
     * holds no further link.
     *
     * Within a link, everything up to and including a ':' is discarded, and
     * the target ends at the first '|' or at "]]". A lone ']' that is not
     * followed by a second ']' abandons that candidate and scanning resumes,
     * so one malformed link does not hide the links after it.
     */
    String getToken() {
        int state = SCANNING;
        StringBuilder target = new StringBuilder(256);

        while (true) {
            char ch = nextChar();
            if (ch == EOFchar) {
                return null;
            }
            switch (state) {
            case SCANNING:
                if (ch == LBRANK) {
                    state = ONE_OPEN;
                }
                break;
            case ONE_OPEN:
                state = (ch == LBRANK) ? IN_LINK : SCANNING;
                break;
            case IN_LINK:
                if (ch == RBRANK) {
                    state = ONE_CLOSE;
                } else if (ch == VERBAR) {
                    // The display label after '|' is skipped by the next call's scan.
                    return target.toString();
                } else if (ch == COLON) {
                    target.setLength(0);
                } else {
                    target.append(ch);
                }
                break;
            case ONE_CLOSE:
                if (ch == RBRANK) {
                    return target.toString();
                }
                // Malformed link: drop it and keep scanning. Returning null
                // here would look like end-of-text to the caller.
                target.setLength(0);
                state = (ch == LBRANK) ? ONE_OPEN : SCANNING;
                break;
            default:
                throw new IllegalStateException("unknown scanner state " + state);
            }
        }
    }
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/xmlParser/TextTagParser.java Wed Sep 05 11:56:21 2012 +0900 @@ -0,0 +1,95 @@ +package xmlParser; + +import xmlParser.XmlTagObject; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Stack; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public class TextTagParser extends DefaultHandler { + + protected Stack<XmlTagObject> stack; + private XmlTagObject currentObj; + private Attributes currentAttr; + private String currentTag; + private String currentTitleName; + final static String TAGNAME_TITLE = "title"; + final static String TAGNAME_TEXT = "text"; + WikiLinkParser linkParser = new WikiLinkParser(); + + HashMap<String,HashSet<String>> hash = new HashMap<String,HashSet<String>>(); + HashSet<String> currentLinkHash = new HashSet<String>(); + + public TextTagParser() { + stack = new Stack<XmlTagObject>(); + } + + public HashMap<String,HashSet<String>> getHash() { + return hash; + } + + public void startDocument() { +// System.out.println("read start"); + } + + public void startElement(String uri, String localName, String qName, + Attributes attributes) throws SAXException { + currentTag = qName; + if (qName.equals(TAGNAME_TITLE)) { + currentObj = new XmlTagObject(attributes); + stack.push(currentObj); + } else if (qName.equals(TAGNAME_TEXT)) { + currentObj = new XmlTagObject(attributes); + stack.push(currentObj); + } else { + + } + } + + public void characters(char[] ch, int offset, int length) { + + String value = new String(ch, offset, length); + if (currentObj != null) { + currentObj.setValue(currentTag, currentAttr, value); + + if(currentTag.equals(TAGNAME_TITLE)) { + currentTitleName = value; + } + if(currentTag.equals(TAGNAME_TEXT)) { + HashSet<String> tmpHash = linkParser.parse(value); + if (tmpHash.size() <= 0) return; + for (String link: tmpHash) { + currentLinkHash.add(link); + } + } + } + } + + public void 
endElement(String uri, String localName, String qName) { + + if (currentObj == null) + return; + if (qName.equals(TAGNAME_TITLE)) { + stack.pop(); + } else if (qName.equals(TAGNAME_TEXT)) { + hash.put(currentTitleName, currentLinkHash); + currentLinkHash = new HashSet<String>(); + stack.pop(); + } else { + + } + if (stack.empty()) + currentObj = null; + else + currentObj = (XmlTagObject) stack.peek(); + } + + public void endDocument() { +// System.out.println("end reading file."); + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/xmlParser/WikiLinkParser.java Wed Sep 05 11:56:21 2012 +0900 @@ -0,0 +1,25 @@ +package xmlParser; + +import java.util.HashSet; + +public class WikiLinkParser { + + private CharReader reader; + + WikiLinkParser() { + reader = new CharReader(); + } + + public HashSet<String> parse(String text) { + HashSet<String> hash = new HashSet<String>(); + + reader.setText(text); + String str; + while ( (str = reader.getToken()) != null) { + hash.add(str); + } + + return hash; + + } +}
// Source file: src/xmlParser/XmlTagObject.java (declared in package xmlParser).
import org.xml.sax.Attributes;

/**
 * Holder for one XML element: its tag name, its attributes and its text value.
 *
 * State is kept per instance. It was previously held in static fields, so
 * every object shared, and overwrote, the same tag, attributes and value.
 */
public class XmlTagObject {

    private Attributes attributes;
    private String currentTag;
    private String value;

    public XmlTagObject() {

    }

    /**
     * Creates a holder with a private copy of {@code attr}.
     *
     * @param attr the element's attributes; may be {@code null}
     */
    public XmlTagObject(Attributes attr) {
        this.attributes = snapshot(attr);
    }

    /**
     * Copies the attributes, because a SAX parser only guarantees the object
     * it passes to startElement for the duration of that callback.
     */
    private static Attributes snapshot(Attributes attr) {
        return attr == null ? null : new org.xml.sax.helpers.AttributesImpl(attr);
    }

    /**
     * Replaces the tag name, attributes and value of this object.
     *
     * @param tag  the element name
     * @param attr the element's attributes, copied; {@code null} clears them
     * @param val  the element's text
     */
    public void setValue(String tag, Attributes attr, String val) {
        this.currentTag = tag;
        this.attributes = snapshot(attr);
        this.value = val;
    }

    public String getTag() {
        return currentTag;
    }

    public Attributes getAttributes() {
        return attributes;
    }

    public String getValue() {
        return value;
    }

}