2015-04-04 119 views
0

目标:我想根据公开可用的DBLP XML文件here制作DBLP数据库的Neo4j实例。我将数据库建模为一个二部图,其中作者在一个集合中,在另一个集合中出版物。要获得John Doe的所有合着者,必须进行以下Cypher查询:Neo4j:特殊字符和时间延迟

MATCH(a:Author) - [:WROTE] - >(publication)< - [:WROTE] - (b:Author)WHERE 。名称=“李四”返回不同的b”

问题1:似乎有一部分与特殊字符,例如电子,æ,我等一个问题,当我在我的浏览器解决http://localhost:7474/browser/,输入查询 “MATCH(一个:作者) - [:WROTE] - >(p)< - [:WROTE] - (b:作者)WHERE a.name = ”简·阿尔Telle公司“ RETURN DISTINCT b”,我应该得到58个独特的结果(共同作者),但我得到了79个结果。例如,合作者DaniëlPaulusma分成三个结果:“Dani”,“ë”,“l Paulusma”。但事实上,我也得到了合着者大卫·凯尔森的三个结果:“大卫·凯尔森”,“大卫”和“凯尔森”。所以问题不仅与特殊字符有关。

问题2:上述查询的结果返回90697毫秒。

编辑:使几个这样的查询结果返回2000毫秒至4000毫秒。

这里是所有代码:

切入点:Application.java:

package std; 

import java.io.File; 

import org.neo4j.graphdb.GraphDatabaseService; 
import org.neo4j.graphdb.Transaction; 
import org.neo4j.graphdb.factory.GraphDatabaseFactory; 
import org.neo4j.kernel.impl.util.FileUtils; 
import org.springframework.beans.factory.annotation.Autowired; 
import org.springframework.boot.CommandLineRunner; 
import org.springframework.boot.SpringApplication; 
import org.springframework.boot.autoconfigure.SpringBootApplication; 
import org.springframework.context.annotation.Bean; 
import org.springframework.context.annotation.Configuration; 
import org.springframework.data.neo4j.config.EnableNeo4jRepositories; 
import org.springframework.data.neo4j.config.Neo4jConfiguration; 
import org.springframework.data.neo4j.core.GraphDatabase; 

import java.io.FileInputStream; 
import java.io.IOException; 
import java.io.InputStream; 

import javax.xml.parsers.ParserConfigurationException; 
import javax.xml.parsers.SAXParser; 
import javax.xml.parsers.SAXParserFactory; 

import org.xml.sax.SAXException; 
import org.apache.xerces.util.SecurityManager; 

@SpringBootApplication 
public class Application implements CommandLineRunner { 

    @Configuration 
    @EnableNeo4jRepositories(basePackages = "std") 
    static class ApplicationConfig extends Neo4jConfiguration { 

     public ApplicationConfig() { 
      setBasePackage("std"); 
     } 

     @Bean 
     GraphDatabaseService graphDatabaseService() { 
      return new GraphDatabaseFactory().newEmbeddedDatabase("dblp.db"); 
     } 

    } 

    @Autowired 
    PublicationRepository publicationRepository; 

    @Autowired 
    GraphDatabase graphDatabase; 

    public void run(String... args) throws Exception { 

     Transaction tx = graphDatabase.beginTx(); 

     try { 

      SAXParserFactory parserFactory = SAXParserFactory.newInstance(); 
      SAXParser parser = parserFactory.newSAXParser(); 

      SecurityManager mgr = new SecurityManager(); 
      mgr.setEntityExpansionLimit(3100000); 
      parser.setProperty("http://apache.org/xml/properties/security-manager", mgr); 

      SaxHandler handler = new SaxHandler(publicationRepository, graphDatabase); 
      handler.setTransaction(tx); 
      parser.getXMLReader().setFeature("http://xml.org/sax/features/validation", true); 
      InputStream xmlInput = new FileInputStream("/Users/username/Documents/dblp.xml"); 
      parser.parse(xmlInput, handler); 

      tx.success(); 


     } catch (SAXException e) { 
      e.printStackTrace(); 
     } catch (IOException e) { 
      e.printStackTrace(); 
     } catch (ParserConfigurationException e) { 
      e.printStackTrace(); 
     } finally { 
      tx.close(); 
     } 

    } 

    public static void main(String[] args) throws Exception { 
     FileUtils.deleteRecursively(new File("dblp.db")); 
     SpringApplication.run(Application.class, args); 
    } 

} 

Author.java:

package std; 

import org.springframework.data.neo4j.annotation.GraphId; 
import org.springframework.data.neo4j.annotation.Indexed; 
import org.springframework.data.neo4j.annotation.NodeEntity; 
import org.springframework.data.neo4j.annotation.Query; 
import org.springframework.data.neo4j.support.index.IndexType; 

@NodeEntity 
public class Author { 

    @GraphId 
    private Long id; 

    @Indexed(indexName = "names", unique = true, indexType = IndexType.FULLTEXT) 
    private String name; 

    public Author() { 

    } 

    public Author(String name) { 

    } 

    @Override 
    public boolean equals(Object obj) { 
     if (this == obj) 
      return true; 

     if (obj == null) 
      return false; 

     if (this.getClass() != obj.getClass()) 
      return false; 

     Author other = (Author) obj; 

     if (this.id != null && this.name != null && other.id != null && other.name != null) { 
      if (this.id.equals(other.id) && this.name.equals(other.name)) 
       return true; 
     } else { 
      return true; 
     } 

     return false; 
    } 

    @Override 
    public int hashCode() { 
     return 31 * (this.id == null ? 1 : this.id.hashCode()) + 31 * (this.name == null ? 1 : this.name.hashCode()); 
    } 

    public Long getId() { 
     return id; 
    } 

    public void setId(Long id) { 
     this.id = id; 
    } 

    public String getName() { 
     return name; 
    } 

    public void setName(String name) { 
     this.name = name; 
    } 
} 

Publication.java:

package std; 

import java.io.Serializable; 
import java.util.HashSet; 
import java.util.Set; 

import org.neo4j.graphdb.Direction; 
import org.springframework.data.neo4j.annotation.GraphId; 
import org.springframework.data.neo4j.annotation.Indexed; 
import org.springframework.data.neo4j.annotation.NodeEntity; 
import org.springframework.data.neo4j.annotation.RelatedTo; 
import org.springframework.data.neo4j.support.index.IndexType; 

@NodeEntity 
public class Publication implements Serializable { 

    private static final long serialVersionUID = -6393545300391560520L; 

    @GraphId 
    Long nodeId; 

    private String type = ""; 
    private String key = ""; 
    private String mdate = ""; 
    private String publtype = ""; 
    private String reviewid = ""; 
    private String rating = ""; 

    @RelatedTo(type = "WROTE", direction = Direction.INCOMING) 
    private Set<Author> authors = new HashSet<Author>(); 
    private String editor = ""; 

    @Indexed(indexType = IndexType.FULLTEXT, indexName = "titles") 
    private String title = ""; 

    private String booktitle = ""; 
    private String pages = ""; 
    private String year = ""; 
    private String address = ""; 
    private String journal = ""; 
    private String volume = ""; 
    private String number = ""; 
    private String month = ""; 
    private String url = ""; 
    private String ee = ""; 
    private String cdrom = ""; 
    private String cite = ""; 
    private String publisher = ""; 
    private String note = ""; 
    private String crossref = ""; 
    private String isbn = ""; 
    private String series = ""; 
    private String school = ""; 
    private String chapter = ""; 

    public Publication() { 

    } 

    public void addAuthor(Author author) { 
     authors.add(author); 
    } 

    public Set<Author> getAuthors() { 
     return authors; 
    } 

    public void setAuthors(Set<Author> authors) { 
     this.authors = authors; 
    } 

    @Override 
    public String toString() { 
     return "TYPE: " + type + "\n" 
       + "KEY: " + key + "\n" 
       + "MDATE: " + mdate + "\n"; 
    } 

    public Long getNodeId() { 
     return nodeId; 
    } 

    public void setNodeId(Long nodeId) { 
     this.nodeId = nodeId; 
    } 

    public String getKey() { 
     return key; 
    } 

    public void setKey(String key) { 
     this.key = key; 
    } 

    public String getMdate() { 
     return mdate; 
    } 

    public void setMdate(String mdate) { 
     this.mdate = mdate; 
    } 

    public String getPubltype() { 
     return publtype; 
    } 

    public void setPubltype(String publtype) { 
     this.publtype = publtype; 
    } 

    public String getReviewid() { 
     return reviewid; 
    } 

    public void setReviewid(String reviewid) { 
     this.reviewid = reviewid; 
    } 

    public String getRating() { 
     return rating; 
    } 

    public void setRating(String rating) { 
     this.rating = rating; 
    } 

    public String getType() { 
     return type; 
    } 

    public void setType(String type) { 
     this.type = type; 
    } 

    public String getEditor() { 
     return editor; 
    } 

    public void setEditor(String editor) { 
     this.editor = editor; 
    } 

    public String getTitle() { 
     return title; 
    } 

    public void setTitle(String title) { 
     this.title = title; 
    } 

    public String getBooktitle() { 
     return booktitle; 
    } 

    public void setBooktitle(String booktitle) { 
     this.booktitle = booktitle; 
    } 

    public String getPages() { 
     return pages; 
    } 

    public void setPages(String pages) { 
     this.pages = pages; 
    } 

    public String getYear() { 
     return year; 
    } 

    public void setYear(String year) { 
     this.year = year; 
    } 

    public String getAddress() { 
     return address; 
    } 

    public void setAddress(String address) { 
     this.address = address; 
    } 

    public String getJournal() { 
     return journal; 
    } 

    public void setJournal(String journal) { 
     this.journal = journal; 
    } 

    public String getVolume() { 
     return volume; 
    } 

    public void setVolume(String volume) { 
     this.volume = volume; 
    } 

    public String getNumber() { 
     return number; 
    } 

    public void setNumber(String number) { 
     this.number = number; 
    } 

    public String getMonth() { 
     return month; 
    } 

    public void setMonth(String month) { 
     this.month = month; 
    } 

    public String getUrl() { 
     return url; 
    } 

    public void setUrl(String url) { 
     this.url = url; 
    } 

    public String getEe() { 
     return ee; 
    } 

    public void setEe(String ee) { 
     this.ee = ee; 
    } 

    public String getCdrom() { 
     return cdrom; 
    } 

    public void setCdrom(String cdrom) { 
     this.cdrom = cdrom; 
    } 

    public String getCite() { 
     return cite; 
    } 

    public void setCite(String cite) { 
     this.cite = cite; 
    } 

    public String getPublisher() { 
     return publisher; 
    } 

    public void setPublisher(String publisher) { 
     this.publisher = publisher; 
    } 

    public String getNote() { 
     return note; 
    } 

    public void setNote(String note) { 
     this.note = note; 
    } 

    public String getCrossref() { 
     return crossref; 
    } 

    public void setCrossref(String crossref) { 
     this.crossref = crossref; 
    } 

    public String getIsbn() { 
     return isbn; 
    } 

    public void setIsbn(String isbn) { 
     this.isbn = isbn; 
    } 

    public String getSeries() { 
     return series; 
    } 

    public void setSeries(String series) { 
     this.series = series; 
    } 

    public String getSchool() { 
     return school; 
    } 

    public void setSchool(String school) { 
     this.school = school; 
    } 

    public String getChapter() { 
     return chapter; 
    } 

    public void setChapter(String chapter) { 
     this.chapter = chapter; 
    } 

} 

PublicationRepository.java:

package std; 

import org.springframework.data.neo4j.repository.GraphRepository; 

public interface PublicationRepository extends GraphRepository<Publication> { 

    Publication findByTitle(String title); 

} 

SaxHandler.java:

package std; 

import java.util.ArrayList; 
import java.util.List; 
import java.util.Stack; 

import org.neo4j.graphdb.Transaction; 
import org.springframework.beans.factory.annotation.Autowired; 
import org.springframework.data.neo4j.core.GraphDatabase; 
import org.xml.sax.Attributes; 
import org.xml.sax.SAXException; 
import org.xml.sax.helpers.DefaultHandler; 

public class SaxHandler extends DefaultHandler { 

    private Stack<String> qNameStack = new Stack<String>(); 
    private Stack<Publication> publicationStack = new Stack<Publication>(); 
    private String publicationType = null; 
    private PublicationRepository publicationRepository = null; 
    private Publication publication = null; 
    private Author author = null; 
    private String currentElement = null; 
    private String value = null; 
    private boolean insideTitle = false; 

    private GraphDatabase graphDatabase; 
    private Transaction tx = null; 

    private static int counter = 0; 

    public List<Publication> getPublications() { 
     return publications; 
    } 

    @Autowired 
    public SaxHandler(PublicationRepository publicationRepository, GraphDatabase graphDatabase) { 
     this.publicationRepository = publicationRepository; 
     this.graphDatabase = graphDatabase; 
    } 

    public void setTransaction(Transaction tx) { 
     this.tx = tx; 
    } 

    public void startElement(String uri, String localName, String tagName, Attributes attributes) throws SAXException { 
     storeTagName(tagName); 
     createEmptyPublication(); 
     testIfEnteringTitle(tagName); 
     testIfPublicationTag(tagName); 
     testOnAttributes(tagName, attributes); 
    } 

    public void endElement(String uri, String localName, String tagName) throws SAXException { 
     testIfLeavingTitle(tagName); 
     removeNameOfLastVisitedTag(); 
     testIfFinishedCreatingPublication(tagName); 
    } 

    public void characters(char ch[], int start, int length) throws SAXException { 
     storeContentsInCurrentPublication(ch, start, length);  
    } 

    /** 
    * Store the contents of the current tag in the corresponding field 
    * of the current publication. 
    * 
    * @param ch 
    * @param start 
    * @param length 
    */ 
    private void storeContentsInCurrentPublication(char ch[], int start, int length) { 
     value = new String(ch,start,length).trim(); 

     if (value.length() == 0) 
      return; 

     publication = publicationStack.peek(); 
     currentElement = qNameStack.peek(); 

     if ("author".equals(currentElement)) {   
      author = new Author(); 
      author.setName(value); 
      publication.addAuthor(author); 
     } else if ("editor".equals(currentElement)) { 
      publication.setEditor(value); 
     } else if ("title".equals(currentElement)) { 
      String title = publication.getTitle() + value; 
      publication.setTitle(title); 
     } else if ("booktitle".equals(currentElement)) { 
      publication.setBooktitle(value); 
     } else if ("pages".equals(currentElement)) { 
      publication.setPages(value); 
     } else if ("year".equals(currentElement)) { 
      publication.setYear(value); 
     } else if ("address".equals(currentElement)) { 
      publication.setAddress(value); 
     } else if ("journal".equals(currentElement)) { 
      publication.setJournal(value); 
     } else if ("volume".equals(currentElement)) { 
      publication.setVolume(value); 
     } else if ("number".equals(currentElement)) { 
      publication.setNumber(value); 
     } else if ("month".equals(currentElement)) { 
      publication.setMonth(value); 
     } else if ("url".equals(currentElement)) { 
      publication.setUrl(value); 
     } else if ("ee".equals(currentElement)) { 
      publication.setEe(value); 
     } else if ("cdrom".equals(currentElement)) { 
      publication.setCdrom(value); 
     } else if ("cite".equals(currentElement)) { 
      publication.setCite(value); 
     } else if ("publisher".equals(currentElement)) { 
      publication.setPublisher(value); 
     } else if ("note".equals(currentElement)) { 
      publication.setNote(value); 
     } else if ("crossref".equals(currentElement)) { 
      publication.setCrossref(value); 
     } else if ("isbn".equals(currentElement)) { 
      publication.setIsbn(value); 
     } else if ("series".equals(currentElement)) { 
      publication.setSeries(value); 
     } else if ("school".equals(currentElement)) { 
      publication.setSchool(value); 
     } else if ("chapter".equals(currentElement)) { 
      publication.setChapter(value); 
     } else if ("i".equals(currentElement) && isInsideTitleOrBooktitle()) { 
      String title = publication.getTitle() + "<i>" + value + "</i>"; 
      publication.setTitle(title); 
     } else if ("sup".equals(currentElement) && isInsideTitleOrBooktitle()) { 
      String title = publication.getTitle() + "<sup>" + value + "</sup>"; 
      publication.setTitle(title); 
     } else if ("sub".equals(currentElement) && isInsideTitleOrBooktitle()) { 
      String title = publication.getTitle() + "<sub>" + value + "</sub>"; 
      publication.setTitle(title); 
     } else if ("tt".equals(currentElement) && isInsideTitleOrBooktitle()) { 
      String title = publication.getTitle() + "<tt>" + value + "</tt>"; 
      publication.setTitle(title); 
     } else if ("ref".equals(currentElement) && isInsideTitleOrBooktitle()) { 
      String title = publication.getTitle() + "<ref>" + value + "</ref>"; 
      publication.setTitle(title); 
     } 
    } 

    /** 
    * Returns true if and only if the parser is inside 
    * either a title or booktitle tag. 
    * 
    * @return true if and only if the parser is inside 
    * either a title or booktitle tag. 
    */ 
    private boolean isInsideTitleOrBooktitle() { 
     return insideTitle; 
    } 

    /** 
    * Checks if the parser is finished with one whole 
    * publication. If so, the publication is stored in 
    * the database. 
    * 
    * @param tagName 
    */ 
    private void testIfFinishedCreatingPublication(String tagName) { 
     if (publicationType.equals(tagName)) { 
      publicationRepository.save(publicationStack.pop()); 
      if (++counter % 1000 == 0) { 
       System.out.println("Counter = " + counter); 
       tx.success(); 
       tx.close(); 
       tx = graphDatabase.beginTx(); 
      } 
     } 
    } 

    /** 
    * Removes the tag name of the last visited tag 
    * from the stack. 
    */ 
    private void removeNameOfLastVisitedTag() { 
     qNameStack.pop(); 
    } 

    /** 
    * Store the tag name on the stack. 
    * 
    * @param tagName 
    */ 
    private void storeTagName(String tagName) { 
     qNameStack.push(tagName); 
    } 

    /** 
    * Create an empty publication to be filled with data. 
    */ 
    private void createEmptyPublication() { 
     publication = new Publication(); 
    } 

    /** 
    * Checks if the parser is entering a title or booktitle tag. If so 
    * is the case, then a boolean flag is set. 
    * 
    * @param tagName the name of the current tag 
    */ 
    private void testIfLeavingTitle(String tagName) { 
     if ("title".equals(tagName) || "booktitle".equals(tagName)) 
      insideTitle = false; 
    } 

    /** 
    * Checks if the parser is entering a title or booktitle tag. If so 
    * is the case, then a boolean flag is set. 
    * 
    * @param tagName the name of the current tag 
    */ 
    private void testIfEnteringTitle(String tagName) { 
     if ("title".equals(tagName) || "booktitle".equals(tagName)) 
      insideTitle = true; 
    } 

    /** 
    * Checks if the current tag is one of: 
    * - article, inproceedings, proceedings, book, incollection, phdthesis, mastersthesis, www 
    * If the current tag is one of these, then the type of the current publication is set 
    * to the corresponding value. 
    * 
    * @param tagName the name of the current tag. 
    */ 
    private void testIfPublicationTag(String tagName) { 
     if ("article".equals(tagName)) { 
      publication.setType("article"); 
     } else if ("inproceedings".equals(tagName)) { 
      publication.setType("inproceedings"); 
     } else if ("proceedings".equals(tagName)) { 
      publication.setType("proceedings"); 
     } else if ("book".equals(tagName)) { 
      publication.setType("book"); 
     } else if ("incollection".equals(tagName)) { 
      publication.setType("incollection"); 
     } else if ("phdthesis".equals(tagName)) { 
      publication.setType("phdthesis"); 
     } else if ("mastersthesis".equals(tagName)) { 
      publication.setType("mastersthesis"); 
     } else if ("www".equals(tagName)) { 
      publication.setType("www"); 
     } 
    } 

    /** 
    * Checks if the tag has any attributes. If so, the existing attribute 
    * values are stored. 
    * 
    * A tag with attributes is one of: 
    * - article, inproceedings, proceedings, book, incollection, phdthesis, mastersthesis, www 
    * 
    * @param tagName the name of the current tag 
    * @param attributes the attributes of the current tag, if any 
    */ 
    private void testOnAttributes(String tagName, Attributes attributes) { 
     if (attributes.getLength() > 0) {   
      publicationType = tagName; 

      if (attributes.getValue("key") != null) { 
       publication.setKey(attributes.getValue("key")); 
      } 
      if (attributes.getValue("mdate") != null) { 
       publication.setMdate(attributes.getValue("mdate")); 
      } 
      if (attributes.getValue("publtype") != null) { 
       publication.setMdate(attributes.getValue("publtype")); 
      } 
      if (attributes.getValue("reviewid") != null) { 
       publication.setMdate(attributes.getValue("reviewid")); 
      } 
      if (attributes.getValue("rating") != null) { 
       publication.setMdate(attributes.getValue("rating")); 
      } 

      publicationStack.push(publication); 
     } 
    } 
} 

的pom.xml:

<?xml version="1.0" encoding="UTF-8"?> 
<project xmlns="http://maven.apache.org/POM/4.0.0" 
      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
      xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 

    <modelVersion>4.0.0</modelVersion> 

    <groupId>com.dblp</groupId> 
    <artifactId>graphdbcreator</artifactId> 
    <version>0.1.0</version> 

    <parent> 
     <groupId>org.springframework.boot</groupId> 
     <artifactId>spring-boot-starter-parent</artifactId> 
     <version>1.2.2.RELEASE</version> 
    </parent> 

    <dependencies> 
     <dependency> 
      <groupId>org.springframework.boot</groupId> 
      <artifactId>spring-boot-starter</artifactId> 
     </dependency> 
     <dependency> 
      <groupId>org.springframework</groupId> 
      <artifactId>spring-context</artifactId> 
     </dependency> 
     <dependency> 
      <groupId>org.springframework</groupId> 
      <artifactId>spring-tx</artifactId> 
     </dependency> 
     <dependency> 
      <groupId>org.springframework.data</groupId> 
      <artifactId>spring-data-neo4j</artifactId> 
     </dependency> 
     <dependency> 
      <groupId>org.hibernate</groupId> 
      <artifactId>hibernate-validator</artifactId> 
     </dependency> 
     <dependency> 
      <groupId>javax.el</groupId> 
      <artifactId>javax.el-api</artifactId> 
      <version>2.2.4</version> 
     </dependency> 
     <dependency> 
      <groupId>xerces</groupId> 
      <artifactId>xercesImpl</artifactId> 
      <version>2.8.0</version> 
     </dependency> 
    </dependencies> 

    <build> 
     <plugins> 
      <plugin> 
       <groupId>org.springframework.boot</groupId> 
       <artifactId>spring-boot-maven-plugin</artifactId> 
      </plugin> 
     </plugins> 
    </build> 

    <repositories> 
     <repository> 
      <id>spring-releases</id> 
      <name>Spring Releases</name> 
      <url>https://repo.spring.io/libs-release</url> 
     </repository> 
     <repository> 
      <id>neo4j</id> 
      <name>Neo4j</name> 
      <url>http://m2.neo4j.org/</url> 
     </repository> 
    </repositories> 

</project> 

回答

0

看来我的SAX处理程序有缺陷。例如,给定一个标记<author>Daniël Paulusma</author>,解析器将对“Dani”的characters()方法进行一次调用,对“ë”执行另一次对characters()的调用,并对“l Paulusma”执行第三次对characters()的调用。我在这里找到了一个简单的解决方案:SAX parsing and special characters

0

对于问题1尝试建立与分析仪适合您的需要手动索引。有关如何使用自定义分析器的详细信息,请参阅http://blog.armbruster-it.de/2014/10/deep-dive-on-fulltext-indexing-with-neo4j/

另一种选择是为使用应用程序所产生侧逻辑和在二次属性存储词干化名称。

第三个选项是增加指的是同样的人笔者节点之间的“相似”的关系。

关于问题2:一定要对作者的name属性的索引:

CREATE INDEX ON :Author(name) 

的后续调用上查询时间的差异很容易与缓存解释,阅读更多的http://neo4j.com/docs/stable/configuration-caches.html

+0

我会进一步研究问题1的解决方案。关于问题2的解决方案,当我提供注解@Indexed(indexName =“names”,unique = true,indexType = IndexType.FULLTEXT)时,是否需要此解决方案? Author类中名称属性的顶部?问题2的 – GCOverhead 2015-04-04 10:35:10

+0

:只需在属性名称中使用'@Indexed(unique = true)'。 – 2015-04-04 10:42:23