diff --git a/src/main/java/org/aksw/agdistis/algorithm/CandidateUtil.java b/src/main/java/org/aksw/agdistis/algorithm/CandidateUtil.java index c542401..088baf8 100755 --- a/src/main/java/org/aksw/agdistis/algorithm/CandidateUtil.java +++ b/src/main/java/org/aksw/agdistis/algorithm/CandidateUtil.java @@ -26,540 +26,552 @@ public class CandidateUtil { - private static Logger log = LoggerFactory.getLogger(CandidateUtil.class); - private String nodeType; - public void setNodeType(String nodeType) { - this.nodeType = nodeType; - } - - private TripleIndex index; - private TripleIndexContext index2; - private NGramDistance nGramDistance; - private CorporationAffixCleaner corporationAffixCleaner; - private DomainWhiteLister domainWhiteLister; - private boolean popularity; - private boolean context; - private boolean acronym; - private boolean commonEntities; - private String algorithm; - - public CandidateUtil() throws IOException { - Properties prop = new Properties(); - InputStream input = CandidateUtil.class.getResourceAsStream("/config/agdistis.properties"); - prop.load(input); - - String envNodeType = System.getenv("AGDISTIS_NODE_TYPE"); - this.nodeType = envNodeType != null ? envNodeType : prop.getProperty("nodeType"); - String envNgramDistance = System.getenv("AGDISTIS_NGRAM_DISTANCE"); - this.nGramDistance = new NGramDistance( - Integer.valueOf(envNgramDistance != null ? envNgramDistance : prop.getProperty("ngramDistance"))); - this.index = new TripleIndex(); - String envContext = System.getenv("AGDISTIS_CONTEXT"); - this.context = Boolean.valueOf(envContext != null ? envContext : prop.getProperty("context")); - if (context == true) { // in case the index by context exist - this.index2 = new TripleIndexContext(); - } - this.corporationAffixCleaner = new CorporationAffixCleaner(); - this.domainWhiteLister = new DomainWhiteLister(index); - String envPopularity = System.getenv("AGDISTIS_POPULARITY"); - this.popularity = Boolean.valueOf(envPopularity != null ? envPopularity : prop.getProperty("popularity")); - String envAcronym = System.getenv("AGDISTIS_ACRONYM"); - this.acronym = Boolean.valueOf(envAcronym != null ? envAcronym : prop.getProperty("acronym")); - String envCommonEntities = System.getenv("AGDISTIS_COMMON_ENTITIES"); - this.commonEntities = Boolean - .valueOf(envCommonEntities != null ? envCommonEntities : prop.getProperty("commonEntities")); - String envAlgorithm = System.getenv("AGDISTIS_ALGORITHM"); - this.algorithm = envAlgorithm != null ? envAlgorithm : prop.getProperty("algorithm"); - } - - public void setIndex(TripleIndex index) { - try { - this.index = index; - this.domainWhiteLister = new DomainWhiteLister(index); - } catch (IOException e) { - log.error("Could not set new index in Candidate Util due to DomainWhiteLister"); - e.printStackTrace(); - } - - } - - public void insertCandidatesIntoText(DirectedSparseGraph graph, Document document, - double threshholdTrigram, Boolean heuristicExpansionOn) throws IOException { - NamedEntitiesInText namedEntities = document.getNamedEntitiesInText(); - String text = document.DocumentText().getText(); - HashMap nodes = new HashMap(); - - // used for heuristic label expansion start with longest Named Entities - Collections.sort(namedEntities.getNamedEntities(), new NamedEntityLengthComparator()); - Collections.reverse(namedEntities.getNamedEntities()); - String entities = ""; - for (NamedEntityInText namedEntity : namedEntities) { - entities = entities.concat(" ".concat(namedEntity.getLabel())); - } - log.info("entities" + entities); - HashSet heuristicExpansion = new HashSet(); - for (NamedEntityInText entity : namedEntities) { - String label = text.substring(entity.getStartPos(), entity.getEndPos()); - - log.info("\tLabel: " + label); - long start = System.currentTimeMillis(); - - if (heuristicExpansionOn) { - label = heuristicExpansion(heuristicExpansion, label); - } - checkLabelCandidates(graph, threshholdTrigram, nodes, entity, label, false, entities); - - log.info("\tGraph size: " + graph.getVertexCount() + " took: " + (System.currentTimeMillis() - start) - + " ms"); - } - } - - private String heuristicExpansion(HashSet heuristicExpansion, String label) { - String tmp = label; - boolean expansion = false; - for (String key : heuristicExpansion) { - if (key.contains(label)) { - // take the shortest possible expansion - if (tmp.length() > key.length() && tmp != label) { - tmp = key; - expansion = true; - log.debug("Heuristic expansion: " + label + "-->" + key); - } - if (tmp.length() < key.length() && tmp == label) { - tmp = key; - expansion = true; - log.debug("Heuristic expansion: " + label + "-->" + key); - } - } - } - label = tmp; - if (!expansion) { - heuristicExpansion.add(label); - } - return label; - } - - public void addNodeToGraph(DirectedSparseGraph graph, HashMap nodes, - NamedEntityInText entity, Triple c, String candidateURL) throws IOException { - Node currentNode = new Node(candidateURL, 0, 0, algorithm); - log.debug("CandidateURL: " + candidateURL); - // candidates are connected to a specific label in the text via their - // start position - if (!graph.addVertex(currentNode)) { - int st = entity.getStartPos(); - if (nodes.get(candidateURL) != null) { - nodes.get(candidateURL).addId(st); - } else { - log.error("This vertex couldn't be added because of an bug in Jung: " + candidateURL); - } - } else { - currentNode.addId(entity.getStartPos()); - nodes.put(candidateURL, currentNode); - } - } - - private void checkLabelCandidates(DirectedSparseGraph graph, double threshholdTrigram, - HashMap nodes, NamedEntityInText entity, String label, boolean searchInSurfaceForms, - String entities) throws IOException { - List candidates = new ArrayList(); - List acronymCandidatesTemp = new ArrayList(); - List acronymCandidatesTemp2 = new ArrayList(); - List candidatesContext = new ArrayList(); - List candidatesContextbyLabel = new ArrayList(); - List linkedsbyContext = new ArrayList(); - int countFinalCandidates = 0; - - PreprocessingNLP nlp = new PreprocessingNLP(); - // Label treatment - label = corporationAffixCleaner.cleanLabelsfromCorporationIdentifier(label); - log.info("Label:" + label); - label = nlp.Preprocessing(label); - // label treatment finished -> - // searchByAcronym - if (acronym == true) { - if (label.equals(label.toUpperCase()) && label.length() <= 4) { - acronymCandidatesTemp = searchbyAcronym(label, searchInSurfaceForms, entity.getType()); - for (Triple triple : acronymCandidatesTemp) { - acronymCandidatesTemp2 = searchAcronymByLabel(triple.getSubject(), searchInSurfaceForms, - entity.getType()); - for (Triple triple2 : acronymCandidatesTemp2) { - if (nGramDistance.getDistance(triple.getSubject(), triple2.getObject()) > threshholdTrigram) { - // follow redirect - triple2.setSubject(redirect(triple2.getSubject())); - // iff it is a disambiguation resource, skip it - if (isDisambiguationResource(triple2.getSubject())) { - continue; - } - if (commonEntities == true) { - addNodeToGraph(graph, nodes, entity, triple2, triple2.getSubject()); - countFinalCandidates++; - } else { - if (domainWhiteLister.fitsIntoDomain(triple2.getSubject())) { - addNodeToGraph(graph, nodes, entity, triple2, triple2.getSubject()); - countFinalCandidates++; - } - } - } - } - acronymCandidatesTemp2.clear(); - } - log.info("\t\tnumber of candidates by acronym: " + countFinalCandidates); - } - } - // searchByAcronymFinished - - if (countFinalCandidates == 0) { - candidates = searchCandidatesByLabel(label, searchInSurfaceForms, "", popularity); - if (searchInSurfaceForms) { - log.info("\t\tnumber of candidates by SF label: " + candidates.size()); - } else { - log.info("\t\tnumber of candidates by main label: " + candidates.size()); - } - - if (candidates.size() == 0) { - log.info("\t\t\tNo candidates for: " + label); - if (label.endsWith("'s")) { - // removing plural s - label = label.substring(0, label.lastIndexOf("'s")); - candidates = searchCandidatesByLabel(label, searchInSurfaceForms, "", popularity); - log.info("\t\t\tEven not with expansion"); - } else if (label.endsWith("s")) { - // removing genitiv s - label = label.substring(0, label.lastIndexOf("s")); - candidates = searchCandidatesByLabel(label, searchInSurfaceForms, "", popularity); - log.info("\t\t\tEven not with expansion"); - } - } - // If the set of candidates is still empty, here we apply stemming - // technique - if (candidates.isEmpty()) { - Stemming stemmer = new Stemming(); - String temp = stemmer.stemming(label); - candidates = searchCandidatesByLabel(temp, searchInSurfaceForms, "", popularity); - log.info("\t\tnumber of all candidates by stemming: " + candidates.size()); - } - // Here starts the similarity by trigram - boolean added = false; - for (Triple c : candidates) { - log.info("Candidate triple to check: " + c); - String candidateURL = c.getSubject(); - String surfaceForm = c.getObject(); - surfaceForm = nlp.Preprocessing(surfaceForm); - // rule of thumb: no year numbers in candidates - if (candidateURL.startsWith(nodeType)) { - // if it is a disambiguation resource, skip it - // trigram similarity - if (c.getPredicate().equals("http://www.w3.org/2000/01/rdf-schema#label")) { - if (nGramDistance.getDistance(surfaceForm, label) < 1.0) {// Here - // we - // set - // the - // similarity - // as - // maximum - // because - // rfds:label - // refers - // to - // the - // main - // reference - // of - // a - // given - // resource - continue; - } - } else if (!c.getPredicate().equals("http://www.w3.org/2000/01/rdf-schema#label")) { // Here - // the - // similarity - // is - // in - // accordance - // with - // the - // user's - // choice. - if (nGramDistance.getDistance(surfaceForm, label) < threshholdTrigram) { - continue; - } - } - // follow redirect - candidateURL = redirect(candidateURL); - if (isDisambiguationResource(candidateURL)) { - log.info("CandidateURL" + candidateURL); - continue; - } - if (commonEntities == true) { // Being able to get all kinds - // of resource not only - // Person, Organization, - // Location - addNodeToGraph(graph, nodes, entity, c, candidateURL); - added = true; - countFinalCandidates++; - } else { - if (domainWhiteLister.fitsIntoDomain(candidateURL)) { - addNodeToGraph(graph, nodes, entity, c, candidateURL); - added = true; - countFinalCandidates++; - } - } - } - } - // Looking by context starts here. - if (!added && !searchInSurfaceForms && context == true) { - log.info("searchByContext"); - candidatesContext = searchCandidatesByContext(entities, label); // looking - // for - // all - // entities - // together - log.info("\t\tnumber of candidates by context: " + candidatesContext.size()); - - // taking all possibles SF for each resource found. - if (candidatesContext != null) { - for (Triple triple : candidatesContext) { - String url = nodeType + triple.getPredicate(); - candidatesContextbyLabel.addAll(searchCandidatesByUrl(url, searchInSurfaceForms)); - } - } - // Here, we apply two filters for increasing the quality of - // possible candidates - for (Triple c : candidatesContextbyLabel) { - log.debug("Candidate triple to check: " + c); - String candidateURL = c.getSubject(); - String cleanCandidateURL = candidateURL.replace(nodeType, ""); - cleanCandidateURL = nlp.Preprocessing(cleanCandidateURL); - if (candidateURL.startsWith(nodeType)) { - // trigram similarity over the URIS - if (nGramDistance.getDistance(cleanCandidateURL, label) < 0.3) { - continue; - } - // finding direct connections - for (Triple temp : candidatesContext) { - String candidateTemp = nodeType + temp.getPredicate(); - linkedsbyContext.addAll(searchbyConnections(candidateURL, candidateTemp)); - } - // Only resources which have connections with others are - // treated as possible candidates. - if (linkedsbyContext.size() < 1) { - continue; - } - // follow redirect - candidateURL = redirect(candidateURL); - - // if it is a disambiguation resource, skip it - if (isDisambiguationResource(candidateURL)) { - continue; - } - // Enabling more types of entities as the previous step. - if (commonEntities == true) { - addNodeToGraph(graph, nodes, entity, c, candidateURL); - added = true; - countFinalCandidates++; - } else { - if (domainWhiteLister.fitsIntoDomain(candidateURL)) { - addNodeToGraph(graph, nodes, entity, c, candidateURL); - added = true; - countFinalCandidates++; - } - } - } - linkedsbyContext.clear(); - } - } - // Looking for the given label among the set of surface forms. - if (!added && !searchInSurfaceForms) { - log.info("Search using SF from disambiguation, redirects and from anchors web pages"); - checkLabelCandidates(graph, threshholdTrigram, nodes, entity, label, true, entities); - } - - } - log.info("\t\tnumber of final candidates " + countFinalCandidates); - } - - private ArrayList searchCandidatesByLabel(String label, boolean searchInSurfaceFormsToo, String type, - boolean popularity) { - ArrayList tmp = new ArrayList(); - ArrayList tmp2 = new ArrayList(); - ArrayList finalTmp = new ArrayList(); - ArrayList candidatesScore = new ArrayList(); - - if (popularity) { // Frequency of entities. - tmp.addAll(index.search(null, "http://www.w3.org/2000/01/rdf-schema#label", label, 500)); - if (searchInSurfaceFormsToo) { - tmp.clear(); - tmp.addAll(index.search(null, "http://www.w3.org/2004/02/skos/core#altLabel", label, 500)); - } - - for (Triple c : tmp) { - tmp2.add(new Triple(c.getSubject(), c.getPredicate(), c.getObject())); - String uri = c.getSubject().replace(nodeType, ""); - candidatesScore = searchCandidatesByScore(uri); - c.setPredicate(c.getObject()); - if (candidatesScore.isEmpty()) { - c.setObject("1"); - } else { - c.setObject(candidatesScore.get(0).getObject()); - } - } - - Collections.sort(tmp); - - if (tmp.size() < 100) { - for (Triple triple : tmp.subList(0, tmp.size())) { - for (Triple triple2 : tmp2) { - if (triple.getSubject().equals(triple2.getSubject()) - && triple.getPredicate().equals(triple2.getObject())) { - finalTmp.add(triple2); - continue; - } - - } - } - - } else if (tmp.size() >= 100) { - for (Triple triple : tmp.subList(0, 100)) { - for (Triple triple2 : tmp2) { - if (triple.getSubject().equals(triple2.getSubject()) - && triple.getPredicate().equals(triple2.getObject())) { - finalTmp.add(triple2); - continue; - } - - } - } - - } - return finalTmp; - } else { - tmp.addAll(index.search(null, "http://www.w3.org/2000/01/rdf-schema#label", label)); - if (searchInSurfaceFormsToo) { - tmp.clear(); - tmp.addAll(index.search(null, "http://www.w3.org/2004/02/skos/core#altLabel", label)); - } - return tmp; - } - } - - public ArrayList searchbyAcronym(String label, boolean searchInSurfaceFormsToo, String type) { - ArrayList tmp = new ArrayList(); - tmp.addAll(index.search(null, "http://dbpedia.org/property/acronym", label, 100)); - return tmp; - } - - public ArrayList searchAcronymByLabel(String label, boolean searchInSurfaceFormsToo, String type) { - ArrayList tmp = new ArrayList(); - tmp.addAll(index.search(null, "http://www.w3.org/2000/01/rdf-schema#label", label, 100)); - return tmp; - } - - ArrayList searchCandidatesByContext(String entities, String label) { - ArrayList tmp = new ArrayList(); - tmp.addAll(index2.search(entities, label, null, 100)); - - return tmp; - } - - ArrayList searchCandidatesByScore(String label) { - ArrayList tmp = new ArrayList(); - tmp.addAll(index2.search(null, label, null)); - - return tmp; - } - - ArrayList searchbyConnections(String uri, String uri2) { - ArrayList tmp = new ArrayList(); - tmp.addAll(index.search(uri, null, uri2)); - - return tmp; - } - - ArrayList searchCandidatesByUrl(String url, boolean searchInSurfaceFormsToo) { - ArrayList tmp = new ArrayList(); - ArrayList tmp2 = new ArrayList(); - ArrayList finalTmp = new ArrayList(); - ArrayList candidatesScore = new ArrayList(); - - if (popularity) { - tmp.addAll(index.search(url, "http://www.w3.org/2000/01/rdf-schema#label", null, 500)); - - for (Triple c : tmp) { - tmp2.add(new Triple(c.getSubject(), c.getPredicate(), c.getObject())); - String uri = c.getSubject().replace(nodeType, ""); - candidatesScore = searchCandidatesByScore(uri); - c.setPredicate(c.getObject()); - if (candidatesScore.isEmpty()) { - c.setObject("1"); - } else { - c.setObject(candidatesScore.get(0).getObject()); - } - } - - Collections.sort(tmp); - - if (tmp.size() < 100) { - for (Triple triple : tmp.subList(0, tmp.size())) { - for (Triple triple2 : tmp2) { - if (triple.getSubject().equals(triple2.getSubject()) - && triple.getPredicate().equals(triple2.getObject())) { - finalTmp.add(triple2); - continue; - } - - } - } - - } else if (tmp.size() >= 100) { - for (Triple triple : tmp.subList(0, 100)) { - for (Triple triple2 : tmp2) { - if (triple.getSubject().equals(triple2.getSubject()) - && triple.getPredicate().equals(triple2.getObject())) { - finalTmp.add(triple2); - continue; - } - - } - } - - } - return finalTmp; - } else { - tmp.addAll(index.search(url, "http://www.w3.org/2000/01/rdf-schema#label", null)); - return tmp; - } - } - - private boolean isDisambiguationResource(String candidateURL) { - List tmp = index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageDisambiguates", null); - if (tmp.isEmpty()) { - return false; - } else { - return true; - } - } - - private String redirect(String candidateURL) { - if (candidateURL == null) { - return candidateURL; - } - List redirect = index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageRedirects", null); - if (redirect.size() == 1) { - return redirect.get(0).getObject(); - } else if (redirect.size() > 1) { - log.error("Several redirects detected for :" + candidateURL); - return candidateURL; - } else { - return candidateURL; - } - } - - public void close() throws IOException { - index.close(); - } - - public TripleIndex getIndex() { - return index; - } + private static Logger log = LoggerFactory.getLogger(CandidateUtil.class); + private String nodeType; + + public void setNodeType(final String nodeType) { + this.nodeType = nodeType; + } + + private TripleIndex index; + private TripleIndexContext index2; + private final NGramDistance nGramDistance; + private final CorporationAffixCleaner corporationAffixCleaner; + private DomainWhiteLister domainWhiteLister; + private final boolean popularity; + private final boolean context; + private final boolean acronym; + private final boolean commonEntities; + private final String algorithm; + + public CandidateUtil(final String file) throws IOException { + final ClassLoader loader = Thread.currentThread().getContextClassLoader(); + final InputStream is = loader.getResourceAsStream(file); + + final Properties prop = new Properties(); + prop.load(is); + + final String envNodeType = System.getenv("AGDISTIS_NODE_TYPE"); + nodeType = envNodeType != null ? envNodeType : prop.getProperty("nodeType"); + final String envNgramDistance = System.getenv("AGDISTIS_NGRAM_DISTANCE"); + nGramDistance = new NGramDistance(Integer + .valueOf(envNgramDistance != null ? envNgramDistance : prop.getProperty("ngramDistance"))); + index = new TripleIndex(file); + final String envContext = System.getenv("AGDISTIS_CONTEXT"); + context = Boolean.valueOf(envContext != null ? envContext : prop.getProperty("context")); + if (context == true) { // in case the index by context exist + index2 = new TripleIndexContext(); + } + corporationAffixCleaner = new CorporationAffixCleaner(); + domainWhiteLister = new DomainWhiteLister(index); + final String envPopularity = System.getenv("AGDISTIS_POPULARITY"); + popularity = + Boolean.valueOf(envPopularity != null ? envPopularity : prop.getProperty("popularity")); + final String envAcronym = System.getenv("AGDISTIS_ACRONYM"); + acronym = Boolean.valueOf(envAcronym != null ? envAcronym : prop.getProperty("acronym")); + final String envCommonEntities = System.getenv("AGDISTIS_COMMON_ENTITIES"); + commonEntities = Boolean.valueOf( + envCommonEntities != null ? envCommonEntities : prop.getProperty("commonEntities")); + final String envAlgorithm = System.getenv("AGDISTIS_ALGORITHM"); + algorithm = envAlgorithm != null ? envAlgorithm : prop.getProperty("algorithm"); + } + + public void setIndex(final TripleIndex index) { + try { + this.index = index; + domainWhiteLister = new DomainWhiteLister(index); + } catch (final IOException e) { + log.error("Could not set new index in Candidate Util due to DomainWhiteLister"); + e.printStackTrace(); + } + + } + + public void insertCandidatesIntoText(final DirectedSparseGraph graph, + final Document document, final double threshholdTrigram, final Boolean heuristicExpansionOn) + throws IOException { + final NamedEntitiesInText namedEntities = document.getNamedEntitiesInText(); + final String text = document.DocumentText().getText(); + final HashMap nodes = new HashMap(); + + // used for heuristic label expansion start with longest Named Entities + Collections.sort(namedEntities.getNamedEntities(), new NamedEntityLengthComparator()); + Collections.reverse(namedEntities.getNamedEntities()); + String entities = ""; + for (final NamedEntityInText namedEntity : namedEntities) { + entities = entities.concat(" ".concat(namedEntity.getLabel())); + } + log.info("entities" + entities); + final HashSet heuristicExpansion = new HashSet(); + for (final NamedEntityInText entity : namedEntities) { + String label = text.substring(entity.getStartPos(), entity.getEndPos()); + + log.info("\tLabel: " + label); + final long start = System.currentTimeMillis(); + + if (heuristicExpansionOn) { + label = heuristicExpansion(heuristicExpansion, label); + } + checkLabelCandidates(graph, threshholdTrigram, nodes, entity, label, false, entities); + + log.info("\tGraph size: " + graph.getVertexCount() + " took: " + + (System.currentTimeMillis() - start) + " ms"); + } + } + + private String heuristicExpansion(final HashSet heuristicExpansion, String label) { + String tmp = label; + boolean expansion = false; + for (final String key : heuristicExpansion) { + if (key.contains(label)) { + // take the shortest possible expansion + if ((tmp.length() > key.length()) && (tmp != label)) { + tmp = key; + expansion = true; + log.debug("Heuristic expansion: " + label + "-->" + key); + } + if ((tmp.length() < key.length()) && (tmp == label)) { + tmp = key; + expansion = true; + log.debug("Heuristic expansion: " + label + "-->" + key); + } + } + } + label = tmp; + if (!expansion) { + heuristicExpansion.add(label); + } + return label; + } + + public void addNodeToGraph(final DirectedSparseGraph graph, + final HashMap nodes, final NamedEntityInText entity, final Triple c, + final String candidateURL) throws IOException { + final Node currentNode = new Node(candidateURL, 0, 0, algorithm); + log.debug("CandidateURL: " + candidateURL); + // candidates are connected to a specific label in the text via their + // start position + if (!graph.addVertex(currentNode)) { + final int st = entity.getStartPos(); + if (nodes.get(candidateURL) != null) { + nodes.get(candidateURL).addId(st); + } else { + log.error("This vertex couldn't be added because of an bug in Jung: " + candidateURL); + } + } else { + currentNode.addId(entity.getStartPos()); + nodes.put(candidateURL, currentNode); + } + } + + private void checkLabelCandidates(final DirectedSparseGraph graph, + final double threshholdTrigram, final HashMap nodes, + final NamedEntityInText entity, String label, final boolean searchInSurfaceForms, + final String entities) throws IOException { + List candidates = new ArrayList(); + List acronymCandidatesTemp = new ArrayList(); + List acronymCandidatesTemp2 = new ArrayList(); + List candidatesContext = new ArrayList(); + final List candidatesContextbyLabel = new ArrayList(); + final List linkedsbyContext = new ArrayList(); + int countFinalCandidates = 0; + + final PreprocessingNLP nlp = new PreprocessingNLP(); + // Label treatment + label = corporationAffixCleaner.cleanLabelsfromCorporationIdentifier(label); + log.info("Label:" + label); + label = nlp.Preprocessing(label); + // label treatment finished -> + // searchByAcronym + if (acronym == true) { + if (label.equals(label.toUpperCase()) && (label.length() <= 4)) { + acronymCandidatesTemp = searchbyAcronym(label, searchInSurfaceForms, entity.getType()); + for (final Triple triple : acronymCandidatesTemp) { + acronymCandidatesTemp2 = + searchAcronymByLabel(triple.getSubject(), searchInSurfaceForms, entity.getType()); + for (final Triple triple2 : acronymCandidatesTemp2) { + if (nGramDistance.getDistance(triple.getSubject(), + triple2.getObject()) > threshholdTrigram) { + // follow redirect + triple2.setSubject(redirect(triple2.getSubject())); + // iff it is a disambiguation resource, skip it + if (isDisambiguationResource(triple2.getSubject())) { + continue; + } + if (commonEntities == true) { + addNodeToGraph(graph, nodes, entity, triple2, triple2.getSubject()); + countFinalCandidates++; + } else { + if (domainWhiteLister.fitsIntoDomain(triple2.getSubject())) { + addNodeToGraph(graph, nodes, entity, triple2, triple2.getSubject()); + countFinalCandidates++; + } + } + } + } + acronymCandidatesTemp2.clear(); + } + log.info("\t\tnumber of candidates by acronym: " + countFinalCandidates); + } + } + // searchByAcronymFinished + + if (countFinalCandidates == 0) { + candidates = searchCandidatesByLabel(label, searchInSurfaceForms, "", popularity); + if (searchInSurfaceForms) { + log.info("\t\tnumber of candidates by SF label: " + candidates.size()); + } else { + log.info("\t\tnumber of candidates by main label: " + candidates.size()); + } + + if (candidates.size() == 0) { + log.info("\t\t\tNo candidates for: " + label); + if (label.endsWith("'s")) { + // removing plural s + label = label.substring(0, label.lastIndexOf("'s")); + candidates = searchCandidatesByLabel(label, searchInSurfaceForms, "", popularity); + log.info("\t\t\tEven not with expansion"); + } else if (label.endsWith("s")) { + // removing genitiv s + label = label.substring(0, label.lastIndexOf("s")); + candidates = searchCandidatesByLabel(label, searchInSurfaceForms, "", popularity); + log.info("\t\t\tEven not with expansion"); + } + } + // If the set of candidates is still empty, here we apply stemming + // technique + if (candidates.isEmpty()) { + final Stemming stemmer = new Stemming(); + final String temp = stemmer.stemming(label); + candidates = searchCandidatesByLabel(temp, searchInSurfaceForms, "", popularity); + log.info("\t\tnumber of all candidates by stemming: " + candidates.size()); + } + // Here starts the similarity by trigram + boolean added = false; + for (final Triple c : candidates) { + log.info("Candidate triple to check: " + c); + String candidateURL = c.getSubject(); + String surfaceForm = c.getObject(); + surfaceForm = nlp.Preprocessing(surfaceForm); + // rule of thumb: no year numbers in candidates + if (candidateURL.startsWith(nodeType)) { + // if it is a disambiguation resource, skip it + // trigram similarity + if (c.getPredicate().equals("http://www.w3.org/2000/01/rdf-schema#label")) { + if (nGramDistance.getDistance(surfaceForm, label) < 1.0) {// Here + // we + // set + // the + // similarity + // as + // maximum + // because + // rfds:label + // refers + // to + // the + // main + // reference + // of + // a + // given + // resource + continue; + } + } else if (!c.getPredicate().equals("http://www.w3.org/2000/01/rdf-schema#label")) { // Here + // the + // similarity + // is + // in + // accordance + // with + // the + // user's + // choice. + if (nGramDistance.getDistance(surfaceForm, label) < threshholdTrigram) { + continue; + } + } + // follow redirect + candidateURL = redirect(candidateURL); + if (isDisambiguationResource(candidateURL)) { + log.info("CandidateURL" + candidateURL); + continue; + } + if (commonEntities == true) { // Being able to get all kinds + // of resource not only + // Person, Organization, + // Location + addNodeToGraph(graph, nodes, entity, c, candidateURL); + added = true; + countFinalCandidates++; + } else { + if (domainWhiteLister.fitsIntoDomain(candidateURL)) { + addNodeToGraph(graph, nodes, entity, c, candidateURL); + added = true; + countFinalCandidates++; + } + } + } + } + // Looking by context starts here. + if (!added && !searchInSurfaceForms && (context == true)) { + log.info("searchByContext"); + candidatesContext = searchCandidatesByContext(entities, label); // looking + // for + // all + // entities + // together + log.info("\t\tnumber of candidates by context: " + candidatesContext.size()); + + // taking all possibles SF for each resource found. + if (candidatesContext != null) { + for (final Triple triple : candidatesContext) { + final String url = nodeType + triple.getPredicate(); + candidatesContextbyLabel.addAll(searchCandidatesByUrl(url, searchInSurfaceForms)); + } + } + // Here, we apply two filters for increasing the quality of + // possible candidates + for (final Triple c : candidatesContextbyLabel) { + log.debug("Candidate triple to check: " + c); + String candidateURL = c.getSubject(); + String cleanCandidateURL = candidateURL.replace(nodeType, ""); + cleanCandidateURL = nlp.Preprocessing(cleanCandidateURL); + if (candidateURL.startsWith(nodeType)) { + // trigram similarity over the URIS + if (nGramDistance.getDistance(cleanCandidateURL, label) < 0.3) { + continue; + } + // finding direct connections + for (final Triple temp : candidatesContext) { + final String candidateTemp = nodeType + temp.getPredicate(); + linkedsbyContext.addAll(searchbyConnections(candidateURL, candidateTemp)); + } + // Only resources which have connections with others are + // treated as possible candidates. + if (linkedsbyContext.size() < 1) { + continue; + } + // follow redirect + candidateURL = redirect(candidateURL); + + // if it is a disambiguation resource, skip it + if (isDisambiguationResource(candidateURL)) { + continue; + } + // Enabling more types of entities as the previous step. + if (commonEntities == true) { + addNodeToGraph(graph, nodes, entity, c, candidateURL); + added = true; + countFinalCandidates++; + } else { + if (domainWhiteLister.fitsIntoDomain(candidateURL)) { + addNodeToGraph(graph, nodes, entity, c, candidateURL); + added = true; + countFinalCandidates++; + } + } + } + linkedsbyContext.clear(); + } + } + // Looking for the given label among the set of surface forms. + if (!added && !searchInSurfaceForms) { + log.info("Search using SF from disambiguation, redirects and from anchors web pages"); + checkLabelCandidates(graph, threshholdTrigram, nodes, entity, label, true, entities); + } + + } + log.info("\t\tnumber of final candidates " + countFinalCandidates); + } + + private ArrayList searchCandidatesByLabel(final String label, + final boolean searchInSurfaceFormsToo, final String type, final boolean popularity) { + final ArrayList tmp = new ArrayList(); + final ArrayList tmp2 = new ArrayList(); + final ArrayList finalTmp = new ArrayList(); + ArrayList candidatesScore = new ArrayList(); + + if (popularity) { // Frequency of entities. + tmp.addAll(index.search(null, "http://www.w3.org/2000/01/rdf-schema#label", label, 500)); + if (searchInSurfaceFormsToo) { + tmp.clear(); + tmp.addAll(index.search(null, "http://www.w3.org/2004/02/skos/core#altLabel", label, 500)); + } + + for (final Triple c : tmp) { + tmp2.add(new Triple(c.getSubject(), c.getPredicate(), c.getObject())); + final String uri = c.getSubject().replace(nodeType, ""); + candidatesScore = searchCandidatesByScore(uri); + c.setPredicate(c.getObject()); + if (candidatesScore.isEmpty()) { + c.setObject("1"); + } else { + c.setObject(candidatesScore.get(0).getObject()); + } + } + + Collections.sort(tmp); + + if (tmp.size() < 100) { + for (final Triple triple : tmp.subList(0, tmp.size())) { + for (final Triple triple2 : tmp2) { + if (triple.getSubject().equals(triple2.getSubject()) + && triple.getPredicate().equals(triple2.getObject())) { + finalTmp.add(triple2); + continue; + } + + } + } + + } else if (tmp.size() >= 100) { + for (final Triple triple : tmp.subList(0, 100)) { + for (final Triple triple2 : tmp2) { + if (triple.getSubject().equals(triple2.getSubject()) + && triple.getPredicate().equals(triple2.getObject())) { + finalTmp.add(triple2); + continue; + } + + } + } + + } + return finalTmp; + } else { + tmp.addAll(index.search(null, "http://www.w3.org/2000/01/rdf-schema#label", label)); + if (searchInSurfaceFormsToo) { + tmp.clear(); + tmp.addAll(index.search(null, "http://www.w3.org/2004/02/skos/core#altLabel", label)); + } + return tmp; + } + } + + public ArrayList searchbyAcronym(final String label, + final boolean searchInSurfaceFormsToo, final String type) { + final ArrayList tmp = new ArrayList(); + tmp.addAll(index.search(null, "http://dbpedia.org/property/acronym", label, 100)); + return tmp; + } + + public ArrayList searchAcronymByLabel(final String label, + final boolean searchInSurfaceFormsToo, final String type) { + final ArrayList tmp = new ArrayList(); + tmp.addAll(index.search(null, "http://www.w3.org/2000/01/rdf-schema#label", label, 100)); + return tmp; + } + + ArrayList searchCandidatesByContext(final String entities, final String label) { + final ArrayList tmp = new ArrayList(); + tmp.addAll(index2.search(entities, label, null, 100)); + + return tmp; + } + + ArrayList searchCandidatesByScore(final String label) { + final ArrayList tmp = new ArrayList(); + tmp.addAll(index2.search(null, label, null)); + + return tmp; + } + + ArrayList searchbyConnections(final String uri, final String uri2) { + final ArrayList tmp = new ArrayList(); + tmp.addAll(index.search(uri, null, uri2)); + + return tmp; + } + + ArrayList searchCandidatesByUrl(final String url, final boolean searchInSurfaceFormsToo) { + final ArrayList tmp = new ArrayList(); + final ArrayList tmp2 = new ArrayList(); + final ArrayList finalTmp = new ArrayList(); + ArrayList candidatesScore = new ArrayList(); + + if (popularity) { + tmp.addAll(index.search(url, "http://www.w3.org/2000/01/rdf-schema#label", null, 500)); + + for (final Triple c : tmp) { + tmp2.add(new Triple(c.getSubject(), c.getPredicate(), c.getObject())); + final String uri = c.getSubject().replace(nodeType, ""); + candidatesScore = searchCandidatesByScore(uri); + c.setPredicate(c.getObject()); + if (candidatesScore.isEmpty()) { + c.setObject("1"); + } else { + c.setObject(candidatesScore.get(0).getObject()); + } + } + + Collections.sort(tmp); + + if (tmp.size() < 100) { + for (final Triple triple : tmp.subList(0, tmp.size())) { + for (final Triple triple2 : tmp2) { + if (triple.getSubject().equals(triple2.getSubject()) + && triple.getPredicate().equals(triple2.getObject())) { + finalTmp.add(triple2); + continue; + } + + } + } + + } else if (tmp.size() >= 100) { + for (final Triple triple : tmp.subList(0, 100)) { + for (final Triple triple2 : tmp2) { + if (triple.getSubject().equals(triple2.getSubject()) + && triple.getPredicate().equals(triple2.getObject())) { + finalTmp.add(triple2); + continue; + } + + } + } + + } + return finalTmp; + } else { + tmp.addAll(index.search(url, "http://www.w3.org/2000/01/rdf-schema#label", null)); + return tmp; + } + } + + private boolean isDisambiguationResource(final String candidateURL) { + final List tmp = + index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageDisambiguates", null); + if (tmp.isEmpty()) { + return false; + } else { + return true; + } + } + + private String redirect(final String candidateURL) { + if (candidateURL == null) { + return candidateURL; + } + final List redirect = + index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageRedirects", null); + if (redirect.size() == 1) { + return redirect.get(0).getObject(); + } else if (redirect.size() > 1) { + log.error("Several redirects detected for :" + candidateURL); + return candidateURL; + } else { + return candidateURL; + } + } + + public void close() throws IOException { + index.close(); + } + + public TripleIndex getIndex() { + return index; + } } diff --git a/src/main/java/org/aksw/agdistis/algorithm/NEDAlgo_HITS.java b/src/main/java/org/aksw/agdistis/algorithm/NEDAlgo_HITS.java index 9910e5a..4b3f284 100755 --- a/src/main/java/org/aksw/agdistis/algorithm/NEDAlgo_HITS.java +++ b/src/main/java/org/aksw/agdistis/algorithm/NEDAlgo_HITS.java @@ -24,149 +24,156 @@ public class NEDAlgo_HITS { - private Logger log = LoggerFactory.getLogger(NEDAlgo_HITS.class); - private String edgeType; - private String nodeType; - private CandidateUtil cu; - private TripleIndex index; - // needed for the experiment about which properties increase accuracy - private double threshholdTrigram; - private int maxDepth; - private Boolean heuristicExpansionOn; - private String algorithm; - - public NEDAlgo_HITS() throws IOException { - Properties prop = new Properties(); - InputStream input = NEDAlgo_HITS.class.getResourceAsStream("/config/agdistis.properties"); - prop.load(input); - - String envNodeType = System.getenv("AGDISTIS_NODE_TYPE"); - String nodeType = envNodeType != null ? envNodeType : prop.getProperty("nodeType"); - String envEdgeType = System.getenv("AGDISTIS_EDGE_TYPE"); - String edgeType = envEdgeType != null ? envEdgeType : prop.getProperty("edgeType"); - String envThresholdTrigram = System.getenv("AGDISTIS_THRESHHOLD_TRIGRAM"); - double threshholdTrigram = Double.valueOf(envThresholdTrigram != null ? envThresholdTrigram : prop.getProperty("threshholdTrigram")); - String envMaxDepth = System.getenv("AGDISTIS_MAX_DEPTH"); - int maxDepth = Integer.valueOf(envMaxDepth != null ? envMaxDepth : prop.getProperty("maxDepth")); - String envHeuristicExpansion = System.getenv("AGDISTIS_HEURISTIC_EXPANSION_ON"); - this.heuristicExpansionOn = Boolean.valueOf(envHeuristicExpansion != null ? envHeuristicExpansion : prop.getProperty("heuristicExpansionOn")); - String envAlgorithm = System.getenv("AGDISTIS_ALGORITHM"); - this.algorithm = envAlgorithm != null ? envAlgorithm : prop.getProperty("algorithm"); - this.nodeType = nodeType; - this.edgeType = edgeType; - this.threshholdTrigram = threshholdTrigram; - this.maxDepth = maxDepth; - this.cu = new CandidateUtil(); - this.index = cu.getIndex(); - } - - public void run(Document document, Map> candidatesPerNE) { - try { - NamedEntitiesInText namedEntities = document.getNamedEntitiesInText(); - DirectedSparseGraph graph = new DirectedSparseGraph(); - - // 0) insert candidates into Text - log.debug("\tinsert candidates"); - cu.insertCandidatesIntoText(graph, document, threshholdTrigram, heuristicExpansionOn); - - // 1) let spread activation/ breadth first search run - log.info("\tGraph size before BFS: " + graph.getVertexCount()); - BreadthFirstSearch bfs = new BreadthFirstSearch(index, algorithm); - bfs.run(maxDepth, graph, edgeType, nodeType); - log.info("\tGraph size after BFS: " + graph.getVertexCount()); - - if (algorithm.equals("hits")) { - // 2.1) let HITS run - log.info("\trun HITS"); - HITS h = new HITS(); - h.runHits(graph, 20); - } else if (algorithm.equals("pagerank")) { - // 2.2) let Pagerank run - log.info("\trun PageRank"); - PageRank pr = new PageRank(); - pr.runPr(graph, 50, 0.1); - } - - // 3) store the candidate with the highest hub, highest authority - // ratio - // manipulate which value to use directly in node.compareTo - log.debug("\torder results"); - ArrayList orderedList = new ArrayList(); - orderedList.addAll(graph.getVertices()); - Collections.sort(orderedList); - for (NamedEntityInText entity : namedEntities) { - for (int i = 0; i < orderedList.size(); i++) { - Node m = orderedList.get(i); - // there can be one node (candidate) for two labels - if (m.containsId(entity.getStartPos())) { - entity.setNamedEntity(m.getCandidateURI()); - break; - } - - } - } - // To get all candidates along with their scores - if (candidatesPerNE != null) { - List listCandidates = new ArrayList<>(); - for (NamedEntityInText entity : namedEntities) { - for (int i = 0; i < orderedList.size(); i++) { - Node m = orderedList.get(i); - - // there can be one node (candidate) for two labels - if (m.containsId(entity.getStartPos())) { - - CandidatesScore candidates = new CandidatesScore(); - candidates.setStart(entity.getStartPos()); - candidates.setUri(m.getCandidateURI()); - candidates.setScore(m.getAuthorityWeight()); - listCandidates.add(candidates); - } - - } - candidatesPerNE.put(entity, listCandidates); - } - } - - } catch (Exception e) { - log.error("AGDISTIS cannot be run on this document.", e); - } - } - - public void close() throws IOException { - cu.close(); - } - - public void setThreshholdTrigram(double threshholdTrigram) { - this.threshholdTrigram = threshholdTrigram; - } - - public void setMaxDepth(int maxDepth) { - this.maxDepth = maxDepth; - } - - public void setHeuristicExpansionOn(Boolean value) { - this.heuristicExpansionOn = value; - } - public String getEdgeType() { - return edgeType; - } - - public void setEdgeType(String edgeType) { - this.edgeType = edgeType; - } - - public String getNodeType() { - return nodeType; - } - - public void setNodeType(String nodeType) { - this.nodeType = nodeType; - this.cu.setNodeType(nodeType); - } - - public void setIndex(TripleIndex index) { - this.index = index; - this.cu.setIndex(index); - } + private final Logger log = LoggerFactory.getLogger(NEDAlgo_HITS.class); + private String edgeType; + private String nodeType; + private final CandidateUtil cu; + private TripleIndex index; + // needed for the experiment about which properties increase accuracy + private double threshholdTrigram; + private int maxDepth; + private Boolean heuristicExpansionOn; + private final String algorithm; + + public NEDAlgo_HITS(final String file) throws IOException { + final ClassLoader loader = Thread.currentThread().getContextClassLoader(); + final InputStream is = loader.getResourceAsStream(file); + + final Properties prop = new Properties(); + prop.load(is); + + final String envNodeType = System.getenv("AGDISTIS_NODE_TYPE"); + final String nodeType = envNodeType != null ? envNodeType : prop.getProperty("nodeType"); + final String envEdgeType = System.getenv("AGDISTIS_EDGE_TYPE"); + final String edgeType = envEdgeType != null ? envEdgeType : prop.getProperty("edgeType"); + final String envThresholdTrigram = System.getenv("AGDISTIS_THRESHHOLD_TRIGRAM"); + final double threshholdTrigram = Double.valueOf( + envThresholdTrigram != null ? envThresholdTrigram : prop.getProperty("threshholdTrigram")); + final String envMaxDepth = System.getenv("AGDISTIS_MAX_DEPTH"); + final int maxDepth = + Integer.valueOf(envMaxDepth != null ? envMaxDepth : prop.getProperty("maxDepth")); + final String envHeuristicExpansion = System.getenv("AGDISTIS_HEURISTIC_EXPANSION_ON"); + heuristicExpansionOn = Boolean.valueOf(envHeuristicExpansion != null ? envHeuristicExpansion + : prop.getProperty("heuristicExpansionOn")); + final String envAlgorithm = System.getenv("AGDISTIS_ALGORITHM"); + algorithm = envAlgorithm != null ? envAlgorithm : prop.getProperty("algorithm"); + this.nodeType = nodeType; + this.edgeType = edgeType; + this.threshholdTrigram = threshholdTrigram; + this.maxDepth = maxDepth; + cu = new CandidateUtil(file); + index = cu.getIndex(); + } + + public void run(final Document document, + final Map> candidatesPerNE) { + try { + final NamedEntitiesInText namedEntities = document.getNamedEntitiesInText(); + final DirectedSparseGraph graph = new DirectedSparseGraph(); + + // 0) insert candidates into Text + log.debug("\tinsert candidates"); + cu.insertCandidatesIntoText(graph, document, threshholdTrigram, heuristicExpansionOn); + + // 1) let spread activation/ breadth first search run + log.info("\tGraph size before BFS: " + graph.getVertexCount()); + final BreadthFirstSearch bfs = new BreadthFirstSearch(index, algorithm); + bfs.run(maxDepth, graph, edgeType, nodeType); + log.info("\tGraph size after BFS: " + graph.getVertexCount()); + + if (algorithm.equals("hits")) { + // 2.1) let HITS run + log.info("\trun HITS"); + final HITS h = new HITS(); + h.runHits(graph, 20); + } else if (algorithm.equals("pagerank")) { + // 2.2) let Pagerank run + log.info("\trun PageRank"); + final PageRank pr = new PageRank(); + pr.runPr(graph, 50, 0.1); + } + + // 3) store the candidate with the highest hub, highest authority + // ratio + // manipulate which value to use directly in node.compareTo + log.debug("\torder results"); + final ArrayList orderedList = new ArrayList(); + orderedList.addAll(graph.getVertices()); + Collections.sort(orderedList); + for (final NamedEntityInText entity : namedEntities) { + for (int i = 0; i < orderedList.size(); i++) { + final Node m = orderedList.get(i); + // there can be one node (candidate) for two labels + if (m.containsId(entity.getStartPos())) { + entity.setNamedEntity(m.getCandidateURI()); + break; + } + + } + } + // To get all candidates along with their scores + if (candidatesPerNE != null) { + final List listCandidates = new ArrayList<>(); + for (final NamedEntityInText entity : namedEntities) { + for (int i = 0; i < orderedList.size(); i++) { + final Node m = orderedList.get(i); + + // there can be one node (candidate) for two labels + if (m.containsId(entity.getStartPos())) { + + final CandidatesScore candidates = new CandidatesScore(); + candidates.setStart(entity.getStartPos()); + candidates.setUri(m.getCandidateURI()); + candidates.setScore(m.getAuthorityWeight()); + listCandidates.add(candidates); + } + + } + candidatesPerNE.put(entity, listCandidates); + } + } + + } catch (final Exception e) { + log.error("AGDISTIS cannot be run on this document.", e); + } + } + + public void close() throws IOException { + cu.close(); + } + + public void setThreshholdTrigram(final double threshholdTrigram) { + this.threshholdTrigram = threshholdTrigram; + } + + public void setMaxDepth(final int maxDepth) { + this.maxDepth = maxDepth; + } + + public void setHeuristicExpansionOn(final Boolean value) { + heuristicExpansionOn = value; + } + + public String getEdgeType() { + return edgeType; + } + + public void setEdgeType(final String edgeType) { + this.edgeType = edgeType; + } + + public String getNodeType() { + return nodeType; + } + + public void setNodeType(final String nodeType) { + this.nodeType = nodeType; + cu.setNodeType(nodeType); + } + + public void setIndex(final TripleIndex index) { + this.index = index; + cu.setIndex(index); + } } diff --git a/src/main/java/org/aksw/agdistis/util/TripleIndex.java b/src/main/java/org/aksw/agdistis/util/TripleIndex.java index f18cbe4..8aae485 100755 --- a/src/main/java/org/aksw/agdistis/util/TripleIndex.java +++ b/src/main/java/org/aksw/agdistis/util/TripleIndex.java @@ -34,158 +34,162 @@ public class TripleIndex { - private static final Version LUCENE44 = Version.LUCENE_44; - - private org.slf4j.Logger log = LoggerFactory.getLogger(TripleIndex.class); - - public static final String FIELD_NAME_SUBJECT = "subject"; - public static final String FIELD_NAME_PREDICATE = "predicate"; - public static final String FIELD_NAME_OBJECT_URI = "object_uri"; - public static final String FIELD_NAME_OBJECT_LITERAL = "object_literal"; - // public static final String FIELD_URI_COUNT = "uri_counts"; - public static final String FIELD_FREQ = "freq"; - - private int defaultMaxNumberOfDocsRetrievedFromIndex = 100; - - private Directory directory; - private IndexSearcher isearcher; - private DirectoryReader ireader; - private UrlValidator urlValidator; - private Cache> cache; - StringUtils isInt = new StringUtils(); - - public TripleIndex() throws IOException { - Properties prop = new Properties(); - InputStream input = TripleIndex.class.getResourceAsStream("/config/agdistis.properties"); - prop.load(input); - - String envIndex = System.getenv("AGDISTIS_INDEX"); - String index = envIndex != null ? envIndex : prop.getProperty("index"); - log.info("The index will be here: " + index); - - directory = new MMapDirectory(new File(index)); - ireader = DirectoryReader.open(directory); - isearcher = new IndexSearcher(ireader); - this.urlValidator = new UrlValidator(); - - cache = CacheBuilder.newBuilder().maximumSize(50000).build(); - } - - public void setIndex(String index) throws IOException { - directory = new MMapDirectory(new File(index)); - ireader = DirectoryReader.open(directory); - isearcher = new IndexSearcher(ireader); - } - - public List search(String subject, String predicate, String object) { - return search(subject, predicate, object, defaultMaxNumberOfDocsRetrievedFromIndex); - } - - public List search(String subject, String predicate, String object, int maxNumberOfResults) { - System.out.println(predicate +" -> "+object + " : " ); - BooleanQuery bq = new BooleanQuery(); - List triples = new ArrayList(); - - try { - if (subject != null && subject.equals("http://aksw.org/notInWiki")) { - log.error( - "A subject 'http://aksw.org/notInWiki' is searched in the index. That is strange and should not happen"); - } - if (subject != null) { - Query tq = new TermQuery(new Term(FIELD_NAME_SUBJECT, subject)); - bq.add(tq, BooleanClause.Occur.MUST); - } - if (predicate != null) { - Query tq = new TermQuery(new Term(FIELD_NAME_PREDICATE, predicate)); - bq.add(tq, BooleanClause.Occur.MUST); - } - - // if (object != null) { - // Query tq = new TermQuery(new Term(FIELD_NAME_OBJECT_LITERAL, - // object)); - // bq.add(tq, BooleanClause.Occur.MUST); - // } - if (object != null && object.length() > 0) { - Query q = null; - if (urlValidator.isValid(object)) { - - q = new TermQuery(new Term(FIELD_NAME_OBJECT_URI, object)); - bq.add(q, BooleanClause.Occur.MUST); - - } else if (StringUtils.isNumeric(object)) { - // System.out.println("here numeric"); - int tempInt = Integer.parseInt(object); - BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_INT); - NumericUtils.intToPrefixCoded(tempInt, 0, bytes); - q = new TermQuery(new Term(FIELD_NAME_OBJECT_LITERAL, bytes.utf8ToString())); - bq.add(q, BooleanClause.Occur.MUST); - - } - // for index from 2014 comment the "else if" below. - // else if (!object.contains(" ")) { - // - // // System.out.println("here regex"); - // KeywordAnalyzer kanalyzer = new KeywordAnalyzer(); - // q = new QueryParser(LUCENE44, FIELD_NAME_OBJECT_LITERAL, - // kanalyzer).parse(object); - // - // bq.add(q, BooleanClause.Occur.MUST); - // } - else { - Analyzer analyzer = new LiteralAnalyzer(LUCENE44); - QueryParser parser = new QueryParser(LUCENE44, FIELD_NAME_OBJECT_LITERAL, analyzer); - parser.setDefaultOperator(QueryParser.Operator.AND); - q = parser.parse(QueryParserBase.escape(object)); - bq.add(q, BooleanClause.Occur.MUST); - } - // bq.add(q, BooleanClause.Occur.MUST); - } - - // use the cache - // if (null == (triples = cache.getIfPresent(bq))) { - triples = getFromIndex(maxNumberOfResults, bq); - cache.put(bq, triples); - // } - - } catch (Exception e) { - log.error(e.getLocalizedMessage() + " -> " + subject); - e.printStackTrace(); - } - return triples; - } - - private List getFromIndex(int maxNumberOfResults, BooleanQuery bq) throws IOException { - log.debug("\t start asking index..."); - TopScoreDocCollector collector = TopScoreDocCollector.create(maxNumberOfResults, true); - // Similarity BM25Similarity = new BM25Similarity(); - // isearcher.setSimilarity(BM25Similarity); - isearcher.search(bq, collector); - ScoreDoc[] hits = collector.topDocs().scoreDocs; - - List triples = new ArrayList(); - String s, p, o; - for (int i = 0; i < hits.length; i++) { - Document hitDoc = isearcher.doc(hits[i].doc); - s = hitDoc.get(FIELD_NAME_SUBJECT); - p = hitDoc.get(FIELD_NAME_PREDICATE); - o = hitDoc.get(FIELD_NAME_OBJECT_URI); - if (o == null) { - o = hitDoc.get(FIELD_NAME_OBJECT_LITERAL); - } - Triple triple = new Triple(s, p, o); - triples.add(triple); - } - log.debug("\t finished asking index..."); - return triples; - } - - public void close() throws IOException { - ireader.close(); - directory.close(); - } - - public DirectoryReader getIreader() { - return ireader; - } + private static final Version LUCENE44 = Version.LUCENE_44; + + private final org.slf4j.Logger log = LoggerFactory.getLogger(TripleIndex.class); + + public static final String FIELD_NAME_SUBJECT = "subject"; + public static final String FIELD_NAME_PREDICATE = "predicate"; + public static final String FIELD_NAME_OBJECT_URI = "object_uri"; + public static final String FIELD_NAME_OBJECT_LITERAL = "object_literal"; + // public static final String FIELD_URI_COUNT = "uri_counts"; + public static final String FIELD_FREQ = "freq"; + + private final int defaultMaxNumberOfDocsRetrievedFromIndex = 100; + + private Directory directory; + private IndexSearcher isearcher; + private DirectoryReader ireader; + private final UrlValidator urlValidator; + private final Cache> cache; + StringUtils isInt = new StringUtils(); + + public TripleIndex(final String file) throws IOException { + final ClassLoader loader = Thread.currentThread().getContextClassLoader(); + final InputStream is = loader.getResourceAsStream(file); + + final Properties prop = new Properties(); + prop.load(is); + + final String envIndex = System.getenv("AGDISTIS_INDEX"); + final String index = envIndex != null ? envIndex : prop.getProperty("index"); + log.info("The index will be here: " + index); + + directory = new MMapDirectory(new File(index)); + ireader = DirectoryReader.open(directory); + isearcher = new IndexSearcher(ireader); + urlValidator = new UrlValidator(); + + cache = CacheBuilder.newBuilder().maximumSize(50000).build(); + } + + public void setIndex(final String index) throws IOException { + directory = new MMapDirectory(new File(index)); + ireader = DirectoryReader.open(directory); + isearcher = new IndexSearcher(ireader); + } + + public List search(final String subject, final String predicate, final String object) { + return search(subject, predicate, object, defaultMaxNumberOfDocsRetrievedFromIndex); + } + + public List search(final String subject, final String predicate, final String object, + final int maxNumberOfResults) { + System.out.println(predicate + " -> " + object + " : "); + final BooleanQuery bq = new BooleanQuery(); + List triples = new ArrayList(); + + try { + if ((subject != null) && subject.equals("http://aksw.org/notInWiki")) { + log.error( + "A subject 'http://aksw.org/notInWiki' is searched in the index. That is strange and should not happen"); + } + if (subject != null) { + final Query tq = new TermQuery(new Term(FIELD_NAME_SUBJECT, subject)); + bq.add(tq, BooleanClause.Occur.MUST); + } + if (predicate != null) { + final Query tq = new TermQuery(new Term(FIELD_NAME_PREDICATE, predicate)); + bq.add(tq, BooleanClause.Occur.MUST); + } + + // if (object != null) { + // Query tq = new TermQuery(new Term(FIELD_NAME_OBJECT_LITERAL, + // object)); + // bq.add(tq, BooleanClause.Occur.MUST); + // } + if ((object != null) && (object.length() > 0)) { + Query q = null; + if (urlValidator.isValid(object)) { + + q = new TermQuery(new Term(FIELD_NAME_OBJECT_URI, object)); + bq.add(q, BooleanClause.Occur.MUST); + + } else if (StringUtils.isNumeric(object)) { + // System.out.println("here numeric"); + final int tempInt = Integer.parseInt(object); + final BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_INT); + NumericUtils.intToPrefixCoded(tempInt, 0, bytes); + q = new TermQuery(new Term(FIELD_NAME_OBJECT_LITERAL, bytes.utf8ToString())); + bq.add(q, BooleanClause.Occur.MUST); + + } + // for index from 2014 comment the "else if" below. + // else if (!object.contains(" ")) { + // + // // System.out.println("here regex"); + // KeywordAnalyzer kanalyzer = new KeywordAnalyzer(); + // q = new QueryParser(LUCENE44, FIELD_NAME_OBJECT_LITERAL, + // kanalyzer).parse(object); + // + // bq.add(q, BooleanClause.Occur.MUST); + // } + else { + final Analyzer analyzer = new LiteralAnalyzer(LUCENE44); + final QueryParser parser = new QueryParser(LUCENE44, FIELD_NAME_OBJECT_LITERAL, analyzer); + parser.setDefaultOperator(QueryParser.Operator.AND); + q = parser.parse(QueryParserBase.escape(object)); + bq.add(q, BooleanClause.Occur.MUST); + } + // bq.add(q, BooleanClause.Occur.MUST); + } + + // use the cache + // if (null == (triples = cache.getIfPresent(bq))) { + triples = getFromIndex(maxNumberOfResults, bq); + cache.put(bq, triples); + // } + + } catch (final Exception e) { + log.error(e.getLocalizedMessage() + " -> " + subject); + e.printStackTrace(); + } + return triples; + } + + private List getFromIndex(final int maxNumberOfResults, final BooleanQuery bq) + throws IOException { + log.debug("\t start asking index..."); + final TopScoreDocCollector collector = TopScoreDocCollector.create(maxNumberOfResults, true); + // Similarity BM25Similarity = new BM25Similarity(); + // isearcher.setSimilarity(BM25Similarity); + isearcher.search(bq, collector); + final ScoreDoc[] hits = collector.topDocs().scoreDocs; + + final List triples = new ArrayList(); + String s, p, o; + for (int i = 0; i < hits.length; i++) { + final Document hitDoc = isearcher.doc(hits[i].doc); + s = hitDoc.get(FIELD_NAME_SUBJECT); + p = hitDoc.get(FIELD_NAME_PREDICATE); + o = hitDoc.get(FIELD_NAME_OBJECT_URI); + if (o == null) { + o = hitDoc.get(FIELD_NAME_OBJECT_LITERAL); + } + final Triple triple = new Triple(s, p, o); + triples.add(triple); + } + log.debug("\t finished asking index..."); + return triples; + } + + public void close() throws IOException { + ireader.close(); + directory.close(); + } + + public DirectoryReader getIreader() { + return ireader; + } } diff --git a/src/main/java/org/aksw/agdistis/webapp/GetDisambiguation.java b/src/main/java/org/aksw/agdistis/webapp/GetDisambiguation.java index 0cef9f9..c09ddf7 100755 --- a/src/main/java/org/aksw/agdistis/webapp/GetDisambiguation.java +++ b/src/main/java/org/aksw/agdistis/webapp/GetDisambiguation.java @@ -6,7 +6,6 @@ import java.net.URLDecoder; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; @@ -35,224 +34,231 @@ public class GetDisambiguation extends ServerResource { - private static Logger log = LoggerFactory.getLogger(GetDisambiguation.class); - private TurtleNIFDocumentParser parser = new TurtleNIFDocumentParser(); - private TurtleNIFDocumentCreator creator = new TurtleNIFDocumentCreator(); - private NIFParser nifParser = new NIFParser(); + private static Logger log = LoggerFactory.getLogger(GetDisambiguation.class); + private final TurtleNIFDocumentParser parser = new TurtleNIFDocumentParser(); + private final TurtleNIFDocumentCreator creator = new TurtleNIFDocumentCreator(); + private final NIFParser nifParser = new NIFParser(); - @Post - public String postText(Representation entity) throws IOException, Exception { - NEDAlgo_HITS agdistis = null; - try { - agdistis = new NEDAlgo_HITS(); - } catch (IOException e) { - log.error( - "Can not load index due to either wrong properties in agdistis.properties or missing index at location", - e); - System.exit(0); - } - log.info("Start working on Request for AGDISTIS"); - String result = ""; - String text = ""; - String type = ""; - InputStream input = entity.getStream(); - // here the inputStream is duplicated due to it can be read only once. - // Therefore, we do it for checking if the input is from gerbil or not. - byte[] byteArray = IOUtils.toByteArray(input); - InputStream input1 = new ByteArrayInputStream(byteArray); - InputStream input2 = new ByteArrayInputStream(byteArray); + @Post + public String postText(final Representation entity) throws IOException, Exception { + NEDAlgo_HITS agdistis = null; + try { + final String cfg = "config/agdistis.properties"; + agdistis = new NEDAlgo_HITS(cfg); + } catch (final IOException e) { + log.error( + "Can not load index due to either wrong properties in agdistis.properties or missing index at location", + e); + System.exit(0); + } + log.info("Start working on Request for AGDISTIS"); + String result = ""; + String text = ""; + String type = ""; + final InputStream input = entity.getStream(); + // here the inputStream is duplicated due to it can be read only once. + // Therefore, we do it for checking if the input is from gerbil or not. + final byte[] byteArray = IOUtils.toByteArray(input); + final InputStream input1 = new ByteArrayInputStream(byteArray); + final InputStream input2 = new ByteArrayInputStream(byteArray); - String string = IOUtils.toString(input1); - // Parse the given representation and retrieve data - Form form = new Form(string); - text = form.getFirstValue("text"); - type = form.getFirstValue("type"); - log.info("text: " + text); - log.info("type: " + type); + final String string = IOUtils.toString(input1); + // Parse the given representation and retrieve data + final Form form = new Form(string); + text = form.getFirstValue("text"); + type = form.getFirstValue("type"); + log.info("text: " + text); + log.info("type: " + type); - if (text == null) { - result = NIFGerbil(input2, agdistis); // This part is created to - // work - // along with GERBIL, because - // GERBIL only sends the NIF - // files without taking care of - // more than one parameter. So, - // GERBIL is not capable to send - // the nif in the text parameter - // making - // AGDISTIS?type=nif&text= not - // work. - return result; - } - if (type == null) { - type = "agdistis"; - } + if (text == null) { + result = NIFGerbil(input2, agdistis); // This part is created to + // work + // along with GERBIL, because + // GERBIL only sends the NIF + // files without taking care of + // more than one parameter. So, + // GERBIL is not capable to send + // the nif in the text parameter + // making + // AGDISTIS?type=nif&text= not + // work. + return result; + } + if (type == null) { + type = "agdistis"; + } - if (type.equals("agdistis")) { - return standardAG(text, agdistis); // This type is the standard - // and in case the user - // doesn't send the type - // parameter, it is - // considered as the main - // one(e.g - // AGDISTIS?type=agdistis&text=Barack - // Obama). + if (type.equals("agdistis")) { + return standardAG(text, agdistis); // This type is the standard + // and in case the user + // doesn't send the type + // parameter, it is + // considered as the main + // one(e.g + // AGDISTIS?type=agdistis&text=Barack + // Obama). - } else if (type.equals("nif")) { - return NIFType(text, agdistis); // This type is for AGDISTIS - // works beyond the GERBIL, this - // part is in case of user wants - // to check just a certain NIF - // file(e.g - // AGDISTIS?type=nif&text=@prefix....) + } else if (type.equals("nif")) { + return NIFType(text, agdistis); // This type is for AGDISTIS + // works beyond the GERBIL, this + // part is in case of user wants + // to check just a certain NIF + // file(e.g + // AGDISTIS?type=nif&text=@prefix....) - } else if (type.equals("candidates")) { - return candidateType(text, agdistis); // Here is to let us know - // about all candidates - // for each mention and - // its respective - // HITS/PageRank score. - } else { - return "ERROR"; - } - } + } else if (type.equals("candidates")) { + return candidateType(text, agdistis); // Here is to let us know + // about all candidates + // for each mention and + // its respective + // HITS/PageRank score. + } else { + return "ERROR"; + } + } - public static Document textToDocument(String preAnnotatedText) { - Document document = new Document(); - ArrayList list = new ArrayList(); - log.info("\tText: " + preAnnotatedText); - int startpos = 0, endpos = 0; - StringBuilder sb = new StringBuilder(); - startpos = preAnnotatedText.indexOf("", startpos); - while (startpos >= 0) { - sb.append(preAnnotatedText.substring(endpos, startpos)); - startpos += 8; - endpos = preAnnotatedText.indexOf("", startpos); - int newStartPos = sb.length(); - String entityLabel = preAnnotatedText.substring(startpos, endpos); - list.add(new NamedEntityInText(newStartPos, entityLabel.length(), entityLabel, "")); - sb.append(entityLabel); - endpos += 9; - startpos = preAnnotatedText.indexOf("", startpos); - } + public static Document textToDocument(final String preAnnotatedText) { + final Document document = new Document(); + final ArrayList list = new ArrayList(); + log.info("\tText: " + preAnnotatedText); + int startpos = 0, endpos = 0; + final StringBuilder sb = new StringBuilder(); + startpos = preAnnotatedText.indexOf("", startpos); + while (startpos >= 0) { + sb.append(preAnnotatedText.substring(endpos, startpos)); + startpos += 8; + endpos = preAnnotatedText.indexOf("", startpos); + final int newStartPos = sb.length(); + final String entityLabel = preAnnotatedText.substring(startpos, endpos); + list.add(new NamedEntityInText(newStartPos, entityLabel.length(), entityLabel, "")); + sb.append(entityLabel); + endpos += 9; + startpos = preAnnotatedText.indexOf("", startpos); + } - NamedEntitiesInText nes = new NamedEntitiesInText(list); - DocumentText text = new DocumentText(preAnnotatedText.replaceAll("", "").replaceAll("", "")); + final NamedEntitiesInText nes = new NamedEntitiesInText(list); + final DocumentText text = + new DocumentText(preAnnotatedText.replaceAll("", "").replaceAll("", "")); - document.addText(text); - document.addNamedEntitiesInText(nes); - return document; - } + document.addText(text); + document.addNamedEntitiesInText(nes); + return document; + } - public String NIFGerbil(InputStream input, NEDAlgo_HITS agdistis) throws IOException { - org.aksw.gerbil.transfer.nif.Document document; - String nifDocument = ""; - String textWithMentions = ""; - List annotations = new ArrayList<>(); - try { - document = parser.getDocumentFromNIFStream(input); - log.info("NIF file coming from GERBIL"); - textWithMentions = nifParser.createTextWithMentions(document.getText(), document.getMarkings(Span.class)); - Document d = textToDocument(textWithMentions); - agdistis.run(d, null); - for (NamedEntityInText namedEntity : d.getNamedEntitiesInText()) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); + public String NIFGerbil(final InputStream input, final NEDAlgo_HITS agdistis) throws IOException { + org.aksw.gerbil.transfer.nif.Document document; + String nifDocument = ""; + String textWithMentions = ""; + final List annotations = new ArrayList<>(); + try { + document = parser.getDocumentFromNIFStream(input); + log.info("NIF file coming from GERBIL"); + textWithMentions = + nifParser.createTextWithMentions(document.getText(), document.getMarkings(Span.class)); + final Document d = textToDocument(textWithMentions); + agdistis.run(d, null); + for (final NamedEntityInText namedEntity : d.getNamedEntitiesInText()) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); - if (disambiguatedURL == null) { - annotations.add(new NamedEntity(namedEntity.getStartPos(), namedEntity.getLength(), URLDecoder - .decode("http://aksw.org/notInWiki/" + namedEntity.getSingleWordLabel(), "UTF-8"))); - } else { - annotations.add(new NamedEntity(namedEntity.getStartPos(), namedEntity.getLength(), - URLDecoder.decode(namedEntity.getNamedEntityUri(), "UTF-8"))); - } - } - document.setMarkings(new ArrayList(annotations)); - log.debug("Result: " + document.toString()); - nifDocument = creator.getDocumentAsNIFString(document); - log.debug(nifDocument); + if (disambiguatedURL == null) { + annotations.add( + new NamedEntity(namedEntity.getStartPos(), namedEntity.getLength(), URLDecoder.decode( + "http://aksw.org/notInWiki/" + namedEntity.getSingleWordLabel(), "UTF-8"))); + } else { + annotations.add(new NamedEntity(namedEntity.getStartPos(), namedEntity.getLength(), + URLDecoder.decode(namedEntity.getNamedEntityUri(), "UTF-8"))); + } + } + document.setMarkings(new ArrayList(annotations)); + log.debug("Result: " + document.toString()); + nifDocument = creator.getDocumentAsNIFString(document); + log.debug(nifDocument); - } catch (Exception e) { - log.error("Exception while reading request.", e); - return ""; - } + } catch (final Exception e) { + log.error("Exception while reading request.", e); + return ""; + } - return nifDocument; - } - @SuppressWarnings("unchecked") - public String standardAG(String text, NEDAlgo_HITS agdistis) { - JSONArray arr = new org.json.simple.JSONArray(); + return nifDocument; + } - Document d = textToDocument(text); - agdistis.run(d, null); + @SuppressWarnings("unchecked") + public String standardAG(final String text, final NEDAlgo_HITS agdistis) { + final JSONArray arr = new org.json.simple.JSONArray(); - for (NamedEntityInText namedEntity : d.getNamedEntitiesInText()) { - if(!namedEntity.getNamedEntityUri().contains("http")){ - namedEntity.setNamedEntity("http://aksw.org/notInWiki/" + namedEntity.getSingleWordLabel()); - } - JSONObject obj = new JSONObject(); - obj.put("namedEntity", namedEntity.getLabel()); - obj.put("start", namedEntity.getStartPos()); - obj.put("offset", namedEntity.getLength()); - obj.put("disambiguatedURL", namedEntity.getNamedEntityUri()); - arr.add(obj); - } - log.info("\t" + arr.toString()); - log.info("Finished Request"); - return arr.toString(); + final Document d = textToDocument(text); + agdistis.run(d, null); - } + for (final NamedEntityInText namedEntity : d.getNamedEntitiesInText()) { + if (!namedEntity.getNamedEntityUri().contains("http")) { + namedEntity.setNamedEntity("http://aksw.org/notInWiki/" + namedEntity.getSingleWordLabel()); + } + final JSONObject obj = new JSONObject(); + obj.put("namedEntity", namedEntity.getLabel()); + obj.put("start", namedEntity.getStartPos()); + obj.put("offset", namedEntity.getLength()); + obj.put("disambiguatedURL", namedEntity.getNamedEntityUri()); + arr.add(obj); + } + log.info("\t" + arr.toString()); + log.info("Finished Request"); + return arr.toString(); - public String NIFType(String text, NEDAlgo_HITS agdistis) throws IOException { - org.aksw.gerbil.transfer.nif.Document document = null; - String nifDocument = ""; - NIFParser nifParser = new NIFParser(); - String textWithMentions = ""; - List annotations = new ArrayList<>(); + } - try { - document = parser.getDocumentFromNIFString(text); - log.debug("Request: " + document.toString()); - textWithMentions = nifParser.createTextWithMentions(document.getText(), document.getMarkings(Span.class)); - Document d = textToDocument(textWithMentions); - agdistis.run(d, null); - for (NamedEntityInText namedEntity : d.getNamedEntitiesInText()) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); + public String NIFType(final String text, final NEDAlgo_HITS agdistis) throws IOException { + org.aksw.gerbil.transfer.nif.Document document = null; + String nifDocument = ""; + final NIFParser nifParser = new NIFParser(); + String textWithMentions = ""; + final List annotations = new ArrayList<>(); - if (disambiguatedURL == null) { - annotations.add(new NamedEntity(namedEntity.getStartPos(), namedEntity.getLength(), URLDecoder - .decode("http://aksw.org/notInWiki/" + namedEntity.getSingleWordLabel(), "UTF-8"))); - } else { - annotations.add(new NamedEntity(namedEntity.getStartPos(), namedEntity.getLength(), - URLDecoder.decode(disambiguatedURL, "UTF-8"))); - } - } - document.setMarkings(new ArrayList(annotations)); - log.debug("Result: " + document.toString()); - nifDocument = creator.getDocumentAsNIFString(document); - } catch (Exception e) { - log.error("Exception while reading request.", e); - return ""; - } - return nifDocument; - } + try { + document = parser.getDocumentFromNIFString(text); + log.debug("Request: " + document.toString()); + textWithMentions = + nifParser.createTextWithMentions(document.getText(), document.getMarkings(Span.class)); + final Document d = textToDocument(textWithMentions); + agdistis.run(d, null); + for (final NamedEntityInText namedEntity : d.getNamedEntitiesInText()) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); - @SuppressWarnings("unchecked") - public String candidateType(String text, NEDAlgo_HITS agdistis) { - JSONArray arr = new org.json.simple.JSONArray(); - Document d = textToDocument(text); - Map> candidatesPerNE = new HashMap<>(); - agdistis.run(d, candidatesPerNE); - for (NamedEntityInText namedEntity : candidatesPerNE.keySet()) { - List candidates = candidatesPerNE.get(namedEntity); - JSONObject obj = new JSONObject(); - obj.put("namedEntity", namedEntity.getLabel()); - obj.put("Candidates", candidates.toString()); - arr.add(obj); - } + if (disambiguatedURL == null) { + annotations.add( + new NamedEntity(namedEntity.getStartPos(), namedEntity.getLength(), URLDecoder.decode( + "http://aksw.org/notInWiki/" + namedEntity.getSingleWordLabel(), "UTF-8"))); + } else { + annotations.add(new NamedEntity(namedEntity.getStartPos(), namedEntity.getLength(), + URLDecoder.decode(disambiguatedURL, "UTF-8"))); + } + } + document.setMarkings(new ArrayList(annotations)); + log.debug("Result: " + document.toString()); + nifDocument = creator.getDocumentAsNIFString(document); + } catch (final Exception e) { + log.error("Exception while reading request.", e); + return ""; + } + return nifDocument; + } - log.info("\t" + arr.toString()); - log.info("Finished Request"); - return arr.toString(); + @SuppressWarnings("unchecked") + public String candidateType(final String text, final NEDAlgo_HITS agdistis) { + final JSONArray arr = new org.json.simple.JSONArray(); + final Document d = textToDocument(text); + final Map> candidatesPerNE = new HashMap<>(); + agdistis.run(d, candidatesPerNE); + for (final NamedEntityInText namedEntity : candidatesPerNE.keySet()) { + final List candidates = candidatesPerNE.get(namedEntity); + final JSONObject obj = new JSONObject(); + obj.put("namedEntity", namedEntity.getLabel()); + obj.put("Candidates", candidates.toString()); + arr.add(obj); + } - } + log.info("\t" + arr.toString()); + log.info("Finished Request"); + return arr.toString(); + + } } diff --git a/src/test/java/AGDISTISTest.java b/src/test/java/AGDISTISTest.java index 22574bc..752e722 100755 --- a/src/test/java/AGDISTISTest.java +++ b/src/test/java/AGDISTISTest.java @@ -11,157 +11,159 @@ import org.aksw.agdistis.datatypes.Document; import org.aksw.agdistis.datatypes.NamedEntitiesInText; import org.aksw.agdistis.datatypes.NamedEntityInText; -import org.aksw.agdistis.util.TripleIndexContext; import org.aksw.agdistis.webapp.GetDisambiguation; import org.junit.Test; public class AGDISTISTest { - - @Test - public void testUmlaute() throws InterruptedException, IOException { - String taisho = "Emperor Taishō"; - String taishoURL = "http://dbpedia.org/resource/Emperor_Taishō"; - String japan = "Japan"; - String japanURL = "http://dbpedia.org/resource/Japan"; - - HashMap correct = new HashMap(); - correct.put(taisho, taishoURL); - correct.put(japan, japanURL); - String preAnnotatedText = "" + taisho + " was the 123rd Emperor of " + japan - + "."; - - NEDAlgo_HITS agdistis = new NEDAlgo_HITS(); - Document d = GetDisambiguation.textToDocument(preAnnotatedText); - agdistis.run(d, null); - - NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); - HashMap results = new HashMap(); - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - results.put(namedEntity, disambiguatedURL); - } - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - System.out.println(namedEntity.getLabel() + " -> " + disambiguatedURL); - assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); - } - - } - - @Test - public void testMinimalExample() throws InterruptedException, IOException { - String obama = "Barack Obama"; - String obamaURL = "http://dbpedia.org/resource/Barack_Obama"; - String merkel = "Angela Merkel"; - String merkelURL = "http://dbpedia.org/resource/Angela_Merkel"; - String city = "Berlin"; - String cityURL = "http://dbpedia.org/resource/Berlin"; - - HashMap correct = new HashMap(); - correct.put(obama, obamaURL); - correct.put(merkel, merkelURL); - correct.put(city, cityURL); - - String preAnnotatedText = "" + obama + " visits " + merkel + " in " - + city + "."; - - NEDAlgo_HITS agdistis = new NEDAlgo_HITS(); - Document d = GetDisambiguation.textToDocument(preAnnotatedText); - agdistis.run(d, null); - - NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); - HashMap results = new HashMap(); - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - results.put(namedEntity, disambiguatedURL); - } - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - System.out.println(namedEntity.getLabel() + " -> " + disambiguatedURL); - assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); - } - - } - - @Test - /** - * This test ensures that entities consiting only of punctations, which are - * later reduced to a string of length 0 by - * https://github.com/dice-group/AGDISTIS/blob/master/src/main/java/org/aksw/agdistis/util/Stemming.java#L91 - * are not causing exceptions - * - * @throws InterruptedException - * @throws IOException - */ - public void testinterpunctation() throws InterruptedException, IOException { - String question = "???"; - String questionURL = "???"; - - HashMap correct = new HashMap(); - correct.put(question, questionURL); - - String preAnnotatedText = "???."; - - NEDAlgo_HITS agdistis = new NEDAlgo_HITS(); - Document d = GetDisambiguation.textToDocument(preAnnotatedText); - agdistis.run(d, null); - - NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); - HashMap results = new HashMap(); - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - results.put(namedEntity, disambiguatedURL); - } - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - System.out.println(namedEntity.getLabel() + " -> " + disambiguatedURL); - assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); - } - - } - - @Test - public void testContext() throws InterruptedException, IOException { - String angelina = "Angelina"; - String angelinaURL = "http://dbpedia.org/resource/Angelina_Jolie"; - String brad = "Brad"; - String bradURL = "http://dbpedia.org/resource/Brad_Pitt"; - String jon = "Jon"; - String jonURL = "http://dbpedia.org/resource/Jon_Voight"; - - // load properties to see if the context index exists - Properties prop = new Properties(); - InputStream input = CandidateUtil.class.getResourceAsStream("/config/agdistis.properties"); - prop.load(input); - String envContext = System.getenv("AGDISTIS_CONTEXT"); - Boolean context = Boolean.valueOf(envContext != null ? envContext : prop.getProperty("context")); - if (context == true) { // in case the index by context exist - - HashMap correct = new HashMap(); - correct.put(angelina, angelinaURL); - correct.put(jon, jonURL); - correct.put(brad, bradURL); - - String preAnnotatedText = "" + angelina + ", her father " + jon - + ", and her partner " + brad - + " never played together in the same movie."; - - NEDAlgo_HITS agdistis = new NEDAlgo_HITS(); - Document d = GetDisambiguation.textToDocument(preAnnotatedText); - agdistis.run(d, null); - - NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); - HashMap results = new HashMap(); - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - results.put(namedEntity, disambiguatedURL); - } - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - System.out.println(namedEntity.getLabel() + " -> " + disambiguatedURL); - assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); - } - } - } + static String CFG = "config/agdistis.properties"; + + @Test + public void testUmlaute() throws InterruptedException, IOException { + final String taisho = "Emperor Taishō"; + final String taishoURL = "http://dbpedia.org/resource/Emperor_Taishō"; + final String japan = "Japan"; + final String japanURL = "http://dbpedia.org/resource/Japan"; + + final HashMap correct = new HashMap(); + correct.put(taisho, taishoURL); + correct.put(japan, japanURL); + final String preAnnotatedText = + "" + taisho + " was the 123rd Emperor of " + japan + "."; + + final NEDAlgo_HITS agdistis = new NEDAlgo_HITS(AGDISTISTest.CFG); + final Document d = GetDisambiguation.textToDocument(preAnnotatedText); + agdistis.run(d, null); + + final NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); + final HashMap results = new HashMap(); + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + results.put(namedEntity, disambiguatedURL); + } + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + System.out.println(namedEntity.getLabel() + " -> " + disambiguatedURL); + assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); + } + + } + + @Test + public void testMinimalExample() throws InterruptedException, IOException { + final String obama = "Barack Obama"; + final String obamaURL = "http://dbpedia.org/resource/Barack_Obama"; + final String merkel = "Angela Merkel"; + final String merkelURL = "http://dbpedia.org/resource/Angela_Merkel"; + final String city = "Berlin"; + final String cityURL = "http://dbpedia.org/resource/Berlin"; + + final HashMap correct = new HashMap(); + correct.put(obama, obamaURL); + correct.put(merkel, merkelURL); + correct.put(city, cityURL); + + final String preAnnotatedText = "" + obama + " visits " + merkel + + " in " + city + "."; + + final NEDAlgo_HITS agdistis = new NEDAlgo_HITS(AGDISTISTest.CFG); + final Document d = GetDisambiguation.textToDocument(preAnnotatedText); + agdistis.run(d, null); + + final NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); + final HashMap results = new HashMap(); + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + results.put(namedEntity, disambiguatedURL); + } + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + System.out.println(namedEntity.getLabel() + " -> " + disambiguatedURL); + assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); + } + + } + + @Test + /** + * This test ensures that entities consiting only of punctations, which are later reduced to a + * string of length 0 by + * https://github.com/dice-group/AGDISTIS/blob/master/src/main/java/org/aksw/agdistis/util/ + * Stemming.java#L91 are not causing exceptions + * + * @throws InterruptedException + * @throws IOException + */ + public void testinterpunctation() throws InterruptedException, IOException { + final String question = "???"; + final String questionURL = "???"; + + final HashMap correct = new HashMap(); + correct.put(question, questionURL); + + final String preAnnotatedText = "???."; + + final NEDAlgo_HITS agdistis = new NEDAlgo_HITS(AGDISTISTest.CFG); + final Document d = GetDisambiguation.textToDocument(preAnnotatedText); + agdistis.run(d, null); + + final NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); + final HashMap results = new HashMap(); + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + results.put(namedEntity, disambiguatedURL); + } + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + System.out.println(namedEntity.getLabel() + " -> " + disambiguatedURL); + assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); + } + + } + + @Test + public void testContext() throws InterruptedException, IOException { + final String angelina = "Angelina"; + final String angelinaURL = "http://dbpedia.org/resource/Angelina_Jolie"; + final String brad = "Brad"; + final String bradURL = "http://dbpedia.org/resource/Brad_Pitt"; + final String jon = "Jon"; + final String jonURL = "http://dbpedia.org/resource/Jon_Voight"; + + // load properties to see if the context index exists + final Properties prop = new Properties(); + final InputStream input = + CandidateUtil.class.getResourceAsStream("/config/agdistis.properties"); + prop.load(input); + final String envContext = System.getenv("AGDISTIS_CONTEXT"); + final Boolean context = + Boolean.valueOf(envContext != null ? envContext : prop.getProperty("context")); + if (context == true) { // in case the index by context exist + + final HashMap correct = new HashMap(); + correct.put(angelina, angelinaURL); + correct.put(jon, jonURL); + correct.put(brad, bradURL); + + final String preAnnotatedText = "" + angelina + ", her father " + jon + + ", and her partner " + brad + + " never played together in the same movie."; + + final NEDAlgo_HITS agdistis = new NEDAlgo_HITS(AGDISTISTest.CFG); + final Document d = GetDisambiguation.textToDocument(preAnnotatedText); + agdistis.run(d, null); + + final NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); + final HashMap results = new HashMap(); + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + results.put(namedEntity, disambiguatedURL); + } + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + System.out.println(namedEntity.getLabel() + " -> " + disambiguatedURL); + assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); + } + } + } } diff --git a/src/test/java/TripleIndexCreatorTest.java b/src/test/java/TripleIndexCreatorTest.java index 0d7f814..9013746 100644 --- a/src/test/java/TripleIndexCreatorTest.java +++ b/src/test/java/TripleIndexCreatorTest.java @@ -1,19 +1,12 @@ -import static org.junit.Assert.assertTrue; - import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.util.HashMap; -import java.util.List; -import java.util.Properties; -import org.aksw.agdistis.algorithm.CandidateUtil; import org.aksw.agdistis.algorithm.NEDAlgo_HITS; import org.aksw.agdistis.datatypes.Document; import org.aksw.agdistis.datatypes.NamedEntitiesInText; import org.aksw.agdistis.datatypes.NamedEntityInText; -import org.aksw.agdistis.util.Triple; import org.aksw.agdistis.util.TripleIndex; import org.aksw.agdistis.util.TripleIndexCreator; import org.aksw.agdistis.webapp.GetDisambiguation; @@ -27,107 +20,108 @@ public class TripleIndexCreatorTest { - Logger log = LoggerFactory.getLogger(TripleIndexCreatorTest.class); - private TripleIndex index; - - @Before - public void init() { - try { - index = new TripleIndex(); - - } catch (IOException e) { - log.error( - "Can not load index or DBpedia repository due to either wrong properties in agdistis.properties or missing index at location", - e); - } - } - - @After - public void close() { - try { - index.close(); - } catch (IOException e) { - log.error( - "Can not load index or DBpedia repository due to either wrong properties in agdistis.properties or missing index at location", - e); - } - } - - @Test - /** - * tests https://github.com/dice-group/AGDISTIS/issues/46 if we need to return - * URIs from two different KBs AGDISTIS returns breaks - * - * @throws IOException - */ - public void testMinimalOntologyExample() throws IOException { - // load test data into index - TripleIndexCreator tic = new TripleIndexCreator(); - File file = new File("src/test/resources/test_evertec.ttl"); - File folder = new File("src/test/resources/evertec"); - if (folder.exists()) { - folder.delete(); - } - tic.createIndex(Lists.newArrayList(file), folder.getAbsolutePath(), null); - - // set the properties correctly - - NEDAlgo_HITS agdistis = new NEDAlgo_HITS(); - agdistis.setNodeType("http://fairhair.ai/kg/resource/"); - agdistis.setEdgeType("http://dbpedia.org/ontology/"); - - // load index - index.setIndex(folder.getAbsolutePath()); - agdistis.setIndex(index); - - // test index - String taisho = "Evertec"; - String taishoURL = "http://fairhair.ai/kg/resource/Evertec"; - String japan = "Puerto Rico"; - String japanURL = "http://dbpedia.org/resource/Puerto_Rico"; - - HashMap correct = new HashMap(); - correct.put(taisho, taishoURL); - correct.put(japan, japanURL); - String preAnnotatedText = "" + taisho + " is a company in" + japan + "."; - - Document d = GetDisambiguation.textToDocument(preAnnotatedText); - agdistis.run(d, null); - - NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); - HashMap results = new HashMap(); - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - System.out.println(namedEntity); - results.put(namedEntity, disambiguatedURL); - } - for (NamedEntityInText namedEntity : results.keySet()) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - System.out.println(namedEntity.getLabel() + " -> " + results.get(namedEntity)); - // TODO comment that line in - // assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); - } - - // TODO if we change the node type, Puerto Rico gets found but not Evertec - agdistis.setNodeType("http://dbpedia.org/resource/"); - - d = GetDisambiguation.textToDocument(preAnnotatedText); - agdistis.run(d, null); - - namedEntities = d.getNamedEntitiesInText(); - results = new HashMap(); - for (NamedEntityInText namedEntity : namedEntities) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - System.out.println(namedEntity); - results.put(namedEntity, disambiguatedURL); - } - for (NamedEntityInText namedEntity : results.keySet()) { - String disambiguatedURL = namedEntity.getNamedEntityUri(); - System.out.println(namedEntity.getLabel() + " -> " + results.get(namedEntity)); - // TODO comment that line in - // assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); - } - - } + Logger log = LoggerFactory.getLogger(TripleIndexCreatorTest.class); + private TripleIndex index; + + @Before + public void init() { + try { + index = new TripleIndex(AGDISTISTest.CFG); + + } catch (final IOException e) { + log.error( + "Can not load index or DBpedia repository due to either wrong properties in agdistis.properties or missing index at location", + e); + } + } + + @After + public void close() { + try { + index.close(); + } catch (final IOException e) { + log.error( + "Can not load index or DBpedia repository due to either wrong properties in agdistis.properties or missing index at location", + e); + } + } + + @Test + /** + * tests https://github.com/dice-group/AGDISTIS/issues/46 if we need to return URIs from two + * different KBs AGDISTIS returns breaks + * + * @throws IOException + */ + public void testMinimalOntologyExample() throws IOException { + // load test data into index + final TripleIndexCreator tic = new TripleIndexCreator(); + final File file = new File("src/test/resources/test_evertec.ttl"); + final File folder = new File("src/test/resources/evertec"); + if (folder.exists()) { + folder.delete(); + } + tic.createIndex(Lists.newArrayList(file), folder.getAbsolutePath(), null); + + // set the properties correctly + + final NEDAlgo_HITS agdistis = new NEDAlgo_HITS(AGDISTISTest.CFG); + agdistis.setNodeType("http://fairhair.ai/kg/resource/"); + agdistis.setEdgeType("http://dbpedia.org/ontology/"); + + // load index + index.setIndex(folder.getAbsolutePath()); + agdistis.setIndex(index); + + // test index + final String taisho = "Evertec"; + final String taishoURL = "http://fairhair.ai/kg/resource/Evertec"; + final String japan = "Puerto Rico"; + final String japanURL = "http://dbpedia.org/resource/Puerto_Rico"; + + final HashMap correct = new HashMap(); + correct.put(taisho, taishoURL); + correct.put(japan, japanURL); + final String preAnnotatedText = + "" + taisho + " is a company in" + japan + "."; + + Document d = GetDisambiguation.textToDocument(preAnnotatedText); + agdistis.run(d, null); + + NamedEntitiesInText namedEntities = d.getNamedEntitiesInText(); + HashMap results = new HashMap(); + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + System.out.println(namedEntity); + results.put(namedEntity, disambiguatedURL); + } + for (final NamedEntityInText namedEntity : results.keySet()) { + namedEntity.getNamedEntityUri(); + System.out.println(namedEntity.getLabel() + " -> " + results.get(namedEntity)); + // TODO comment that line in + // assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); + } + + // TODO if we change the node type, Puerto Rico gets found but not Evertec + agdistis.setNodeType("http://dbpedia.org/resource/"); + + d = GetDisambiguation.textToDocument(preAnnotatedText); + agdistis.run(d, null); + + namedEntities = d.getNamedEntitiesInText(); + results = new HashMap(); + for (final NamedEntityInText namedEntity : namedEntities) { + final String disambiguatedURL = namedEntity.getNamedEntityUri(); + System.out.println(namedEntity); + results.put(namedEntity, disambiguatedURL); + } + for (final NamedEntityInText namedEntity : results.keySet()) { + namedEntity.getNamedEntityUri(); + System.out.println(namedEntity.getLabel() + " -> " + results.get(namedEntity)); + // TODO comment that line in + // assertTrue(correct.get(namedEntity.getLabel()).equals(disambiguatedURL)); + } + + } } diff --git a/src/test/java/TripleIndexTest.java b/src/test/java/TripleIndexTest.java index b6790a4..889ffb0 100755 --- a/src/test/java/TripleIndexTest.java +++ b/src/test/java/TripleIndexTest.java @@ -15,124 +15,130 @@ public class TripleIndexTest { - Logger log = LoggerFactory.getLogger(TripleIndexTest.class); - private TripleIndex index; - - @Before - public void init() { - try { - index = new TripleIndex(); - - } catch (IOException e) { - log.error( - "Can not load index or DBpedia repository due to either wrong properties in agdistis.properties or missing index at location", - e); - } - } - - @After - public void close() { - try { - index.close(); - } catch (IOException e) { - log.error( - "Can not load index or DBpedia repository due to either wrong properties in agdistis.properties or missing index at location", - e); - } - } - - @Test - public void testRedirects() { - String candidateURL = "http://dbpedia.org/resource/Barack_H_Obama_Junior"; - List redirect = index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageRedirects", null); - for (Triple t : redirect) { - log.debug(t.toString()); - } - assertTrue(redirect.size() == 1); - candidateURL = "http://dbpedia.org/resource/Barack_Obama"; - redirect = index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageRedirects", null); - assertTrue(redirect.size() == 0); - } - - @Test - public void testDisambiguation() { - String candidateURL = "http://dbpedia.org/resource/Bama"; - List dis = index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageDisambiguates", null); - assertTrue(dis.size() > 0); - for (Triple t : dis) { - log.debug(t.toString()); - } - } - - @Test - public void testType() { - String candidateURL = "http://dbpedia.org/resource/Barack_Obama"; - List type = index.search(candidateURL, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", null); - assertTrue(type.size() > 0); - for (Triple t : type) { - log.debug(t.toString()); - } - } - - @Test - public void testRdfsLabel() { - // String candidateURL = "http://dbpedia.org/resource/Berlin"; - String candidateURL = "http://dbpedia.org/resource/Barack_Obama"; - List type = index.search(candidateURL, "http://www.w3.org/2000/01/rdf-schema#label", null); - assertTrue(type.size() > 0); - for (Triple t : type) { - log.debug(t.toString()); - } - } - - @Test - public void testSurfaceForm() { - String candidateURL = "http://dbpedia.org/resource/Jon_Voight"; - List type = index.search(candidateURL, null, null); - assertTrue(type.size() > 0); - for (Triple t : type) { - log.debug(t.toString()); - } - } - - /* - * @Test public void testAcronym() { List type = index.search(null, - * "http://dbpedia.org/property/acronym", null); assertTrue(type.size() > - * 0); for (Triple t : type) { log.debug(t.toString()); } } - */ - @Test - public void testMultipleTermsPerField() { - String candidate = "Berlin"; - List type = index.search(null, null, candidate); - assertTrue(type.size() > 1); - for (Triple t : type) { - log.debug(t.toString()); - } - } - - @Test - public void testdirectLink() { - String candidate = "http://dbpedia.org/resource/Angelina_Jolie"; - String candidate2 = "http://dbpedia.org/resource/Jon_Voight"; - List type = index.search(candidate, null, candidate2); - assertTrue(type.size() >= 1); - for (Triple t : type) { - log.debug(t.toString()); - } - } - - @Test - public void testSurfaceFormsDistance() { - String candidateURL = "http://dbpedia.org/resource/Barack_Obama"; - List label = index.search(candidateURL, "http://www.w3.org/2000/01/rdf-schema#label", null); - List surfaceForms = index.search(candidateURL, "http://www.w3.org/2004/02/skos/core#altLabel", null); - log.debug(" * " + surfaceForms.size()); - NGramDistance n = new NGramDistance(3); - for (Triple t : surfaceForms) { - log.debug(label.get(0).getObject() + " " + t.getObject() + " : " - + n.getDistance(label.get(0).getObject(), t.getObject())); - assertTrue(n.getDistance(label.get(0).getObject(), t.getObject()) >= 0); - - } - } + Logger log = LoggerFactory.getLogger(TripleIndexTest.class); + private TripleIndex index; + + @Before + public void init() { + try { + index = new TripleIndex(AGDISTISTest.CFG); + + } catch (final IOException e) { + log.error( + "Can not load index or DBpedia repository due to either wrong properties in agdistis.properties or missing index at location", + e); + } + } + + @After + public void close() { + try { + index.close(); + } catch (final IOException e) { + log.error( + "Can not load index or DBpedia repository due to either wrong properties in agdistis.properties or missing index at location", + e); + } + } + + @Test + public void testRedirects() { + String candidateURL = "http://dbpedia.org/resource/Barack_H_Obama_Junior"; + List redirect = + index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageRedirects", null); + for (final Triple t : redirect) { + log.debug(t.toString()); + } + assertTrue(redirect.size() == 1); + candidateURL = "http://dbpedia.org/resource/Barack_Obama"; + redirect = index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageRedirects", null); + assertTrue(redirect.size() == 0); + } + + @Test + public void testDisambiguation() { + final String candidateURL = "http://dbpedia.org/resource/Bama"; + final List dis = + index.search(candidateURL, "http://dbpedia.org/ontology/wikiPageDisambiguates", null); + assertTrue(dis.size() > 0); + for (final Triple t : dis) { + log.debug(t.toString()); + } + } + + @Test + public void testType() { + final String candidateURL = "http://dbpedia.org/resource/Barack_Obama"; + final List type = + index.search(candidateURL, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", null); + assertTrue(type.size() > 0); + for (final Triple t : type) { + log.debug(t.toString()); + } + } + + @Test + public void testRdfsLabel() { + // String candidateURL = "http://dbpedia.org/resource/Berlin"; + final String candidateURL = "http://dbpedia.org/resource/Barack_Obama"; + final List type = + index.search(candidateURL, "http://www.w3.org/2000/01/rdf-schema#label", null); + assertTrue(type.size() > 0); + for (final Triple t : type) { + log.debug(t.toString()); + } + } + + @Test + public void testSurfaceForm() { + final String candidateURL = "http://dbpedia.org/resource/Jon_Voight"; + final List type = index.search(candidateURL, null, null); + assertTrue(type.size() > 0); + for (final Triple t : type) { + log.debug(t.toString()); + } + } + + /* + * @Test public void testAcronym() { List type = index.search(null, + * "http://dbpedia.org/property/acronym", null); assertTrue(type.size() > 0); for (Triple t : + * type) { log.debug(t.toString()); } } + */ + @Test + public void testMultipleTermsPerField() { + final String candidate = "Berlin"; + final List type = index.search(null, null, candidate); + assertTrue(type.size() > 1); + for (final Triple t : type) { + log.debug(t.toString()); + } + } + + @Test + public void testdirectLink() { + final String candidate = "http://dbpedia.org/resource/Angelina_Jolie"; + final String candidate2 = "http://dbpedia.org/resource/Jon_Voight"; + final List type = index.search(candidate, null, candidate2); + assertTrue(type.size() >= 1); + for (final Triple t : type) { + log.debug(t.toString()); + } + } + + @Test + public void testSurfaceFormsDistance() { + final String candidateURL = "http://dbpedia.org/resource/Barack_Obama"; + final List label = + index.search(candidateURL, "http://www.w3.org/2000/01/rdf-schema#label", null); + final List surfaceForms = + index.search(candidateURL, "http://www.w3.org/2004/02/skos/core#altLabel", null); + log.debug(" * " + surfaceForms.size()); + final NGramDistance n = new NGramDistance(3); + for (final Triple t : surfaceForms) { + log.debug(label.get(0).getObject() + " " + t.getObject() + " : " + + n.getDistance(label.get(0).getObject(), t.getObject())); + assertTrue(n.getDistance(label.get(0).getObject(), t.getObject()) >= 0); + + } + } }