Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Microdata Support in EM translator #1660

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 202 additions & 6 deletions Embedded Metadata.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2018-10-07 16:46:47"
"lastUpdated": "2018-10-09 21:18:00"
}

/*
Expand Down Expand Up @@ -152,7 +152,7 @@ function getPrefixes(doc) {

//also look in html and head elements
var prefixes = (doc.documentElement.getAttribute('prefix') || '')
+ (doc.head.getAttribute('prefix') || '');
+ (doc.head ? doc.head.getAttribute('prefix') || '': '');
var prefixRE = /(\w+):\s+(\S+)/g;
var m;
while(m = prefixRE.exec(prefixes)) {
Expand Down Expand Up @@ -319,6 +319,120 @@ function init(doc, url, callback, forceLoadRDF) {
}
}
}


// Scraping methods for microdata.

// Collect every element that opens a microdata item.
var schemaItems = ZU.xpath(doc, '//*[@itemscope]');
// Some websites also place itemprop attributes outside of any itemscope,
// so the root element is appended as one more item unless it already
// carries an itemscope itself (and is therefore in the list already).
var overallDocument = ZU.xpath(doc, '//html[not(@itemscope)]');
if (overallDocument.length > 0) {
	schemaItems.push(overallDocument[0]);
}

// Item types that get their own itemid instead of the page url (see below).
var auxiliaryItems = ["http://schema.org/Person",
	"http://schema.org/Organization", "http://schema.org/Place",
	"http://schema.org/PostalAddress"];
// Item types that are dropped from the list entirely. All entries must use
// the http:// form, because itemtype is normalized to http:// before the
// comparison; an https:// entry could never match.
var structuralTypes = ["http://schema.org/BreadcrumbList",
	"http://schema.org/ListItem", "http://schema.org/ImageObject",
	"http://schema.org/SiteNavigationElement", "http://schema.org/WPAdBlock"];
// Traverse in reverse order so that splicing out structural items does not
// shift the indices of the elements that are still to be visited.
for (let i = schemaItems.length - 1; i >= 0; i--) {
	let itemType = schemaItems[i].getAttribute("itemtype");
	// Treat the https:// and http:// spellings of a type as the same type.
	if (itemType) itemType = itemType.replace("https://", "http://");
	if (structuralTypes.includes(itemType)) {
		schemaItems.splice(i, 1);
		continue;
	}
	// Assign the itemid to each item, which is by default the url,
	// but auxiliary items like person, organization need a different itemid.
	if (auxiliaryItems.includes(itemType)) {
		schemaItems[i].itemid = url + "#itemid=" + i;
	} else {
		schemaItems[i].itemid = url;
	}
}

/**
 * Determine the value of a microdata property from the element that
 * carries the itemprop attribute.
 * See also https://www.w3.org/TR/microdata/#values
 *
 * @param {Element} propertyNode - element with an itemprop attribute
 * @returns {Object|String|null} `{itemid: true, value: <itemid of the node>}`
 *   when the element has an itemscope attribute; otherwise the attribute
 *   that holds the value for this tag name (possibly null when that
 *   attribute is missing), or the text content for all remaining elements.
 */
function microdataValue(propertyNode) {
	// An element with itemscope is referenced through its itemid property.
	if (propertyNode.hasAttribute("itemscope")) {
		return { itemid: true, value: propertyNode.itemid };
	}

	const tag = propertyNode.tagName.toLowerCase();
	const srcTags = ["audio", "embed", "iframe", "img", "source", "track", "video"];
	const hrefTags = ["a", "area", "link"];
	const valueTags = ["data", "meter"];

	if (tag === "meta") {
		return propertyNode.getAttribute("content");
	}
	if (srcTags.includes(tag)) {
		return propertyNode.getAttribute("src");
	}
	if (hrefTags.includes(tag)) {
		return propertyNode.getAttribute("href");
	}
	if (tag === "object") {
		return propertyNode.getAttribute("data");
	}
	if (valueTags.includes(tag)) {
		return propertyNode.getAttribute("value");
	}
	if (tag === "time") {
		return propertyNode.getAttribute("datetime");
	}
	// Non-standard, but can occur: a span with several child nodes and a
	// non-empty content attribute. Any other span falls back to its text.
	if (tag === "span" && propertyNode.childNodes.length > 1) {
		const content = propertyNode.getAttribute("content");
		if (content) {
			return content;
		}
	}
	return propertyNode.textContent;
}


// Turn every remaining microdata item into [subject, predicate, object]
// statements: one rdf:type statement per listed type, and one statement per
// property name found on the item's own itemprop elements.
// NOTE: itemref attributes are currently not handled.
for (var i = 0; i < schemaItems.length; i++) {
	var typesList = schemaItems[i].getAttribute("itemtype");
	var baseUrl = "";
	if (typesList) {
		var types = typesList.split(" ");
		for (var k = 0; k < types.length; k++) {
			// Treat the https:// and http:// spellings of a type as the same type.
			types[k] = types[k].replace("https://", "http://");
			statements.push([schemaItems[i].itemid, _prefixes.rdf + "type", types[k]]);
		}
		// The vocabulary base is the first type up to and including its
		// last '/' or '#'; property names are appended to it below.
		var endSignal = Math.max(types[0].lastIndexOf('/'), types[0].lastIndexOf('#'));
		baseUrl = types[0].slice(0, endSignal + 1);
	} else {
		// Items without an itemtype are assumed to use schema.org.
		baseUrl = "http://schema.org/";
	}

	// Get all properties below this item, and skip the ones that sit
	// inside a nested itemscope.
	var properties = ZU.xpath(schemaItems[i], './/*[@itemprop]');
	var exclude = ZU.xpath(schemaItems[i], './/*[@itemscope]//*[@itemprop]');
	for (let j = 0; j < properties.length; j++) {
		if (exclude.includes(properties[j])) continue;

		var propertyList = properties[j].getAttribute("itemprop");
		var propertyValue = microdataValue(properties[j]) || "";
		// A common error is to put the author data in an a-tag, which
		// then consequently gets evaluated to the href attribute.
		// tagName is upper case in HTML documents but keeps its source
		// case in XML documents, hence the case-insensitive comparison.
		if (propertyList === "author" && properties[j].tagName.toLowerCase() === "a") {
			propertyValue = properties[j].textContent;
		}
		// It is possible to assign the same value to multiple
		// properties (separated by space) at the same time.
		var propertyNames = propertyList.split(" ");
		for (let k = 0; k < propertyNames.length; k++) {
			statements.push([schemaItems[i].itemid, baseUrl + propertyNames[k], propertyValue]);
		}
	}
}

// For debugging microdata parsing
//Z.debug(statements);

if(statements.length || forceLoadRDF) {
// load RDF translator, so that we don't need to replicate import code
Expand All @@ -331,14 +445,19 @@ function init(doc, url, callback, forceLoadRDF) {
});

translator.getTranslatorObject(function(rdf) {
for(var i=0; i<statements.length; i++) {
var statement = statements[i];
rdf.Zotero.RDF.addStatement(statement[0], statement[1], statement[2], true);
for (let statement of statements) {
if (statement[2].itemid) {
if (statement[2].value) {
rdf.Zotero.RDF.addStatement(statement[0], statement[1], statement[2].value, false);
}
} else {
rdf.Zotero.RDF.addStatement(statement[0], statement[1], statement[2], true);
}
}
var nodes = rdf.getNodes(true);
rdf.defaultUnknownType = hwType || hwTypeGuess || generatorType ||
//if we have RDF data, then default to webpage
(nodes.length ? "webpage":false);
(nodes.length ? "webpage" : false);

//if itemType is overridden, no reason to run RDF.detectWeb
if(exports.itemType) {
Expand Down Expand Up @@ -1467,6 +1586,83 @@ var testCases = [
"seeAlso": []
}
]
},
{
"type": "web",
"url": "https://www.haaretz.com/islamic-jihad-if-hunger-striker-dies-we-ll-respond-with-force-1.5387076",
"items": [
{
"itemType": "newspaperArticle",
"title": "Islamic Jihad: If hunger striker dies, we'll respond with force against Israel",
"creators": [
{
"firstName": "Jack",
"lastName": "Khoury",
"creatorType": "author"
},
{
"firstName": "Shirly",
"lastName": "Seidler",
"creatorType": "author"
},
{
"firstName": "Ido",
"lastName": "Efrati",
"creatorType": "author"
}
],
"date": "2015-08-14T19:26:10+0300",
"abstractNote": "Islamic Jihad says it will no longer be committed to maintaining calm if Mohammed Allaan, who lost consciousness after 60-day hunger strike, dies.",
"libraryCatalog": "www.haaretz.com",
"publicationTitle": "haaretz.com",
"shortTitle": "Islamic Jihad",
"url": "https://www.haaretz.com/islamic-jihad-if-hunger-striker-dies-we-ll-respond-with-force-1.5387076",
"attachments": [
{
"title": "Snapshot"
}
],
"tags": [],
"notes": [],
"seeAlso": []
}
]
},
{
"type": "web",
"url": "https://www.zeit.de/politik/ausland/2011-09/libyen-bani-walid",
"items": [
{
"itemType": "webpage",
"title": "Libyen: Rebellen bereiten Angriff auf Bani Walid vor",
"creators": [
{
"firstName": "",
"lastName": "AFP",
"creatorType": "author"
},
{
"firstName": "",
"lastName": "dpa",
"creatorType": "author"
}
],
"date": "2011-09-04T14:55:40+02:00",
"abstractNote": "Die von Gadhafi-Anhängern geführte Stadt ist von Rebellentruppen eingekreist. Gespräche über eine friedliche Übergabe sind gescheitert, ein Angriff steht offenbar bevor.",
"language": "de",
"shortTitle": "Libyen",
"url": "https://www.zeit.de/politik/ausland/2011-09/libyen-bani-walid",
"websiteTitle": "ZEIT ONLINE",
"attachments": [
{
"title": "Snapshot"
}
],
"tags": [],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/
38 changes: 31 additions & 7 deletions RDF.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"inRepository": true,
"translatorType": 1,
"browserSupport": "gcsibv",
"lastUpdated": "2018-10-07 16:32:26"
"lastUpdated": "2018-10-09 21:27:00"
}

/*
Expand Down Expand Up @@ -385,6 +385,8 @@ function detectType(newItem, node, ret) {
break;
case 'discussionforumposting':
t.so = 'forumPost'; break;
case 'qapage'://e.g. stackoverflow
t.soGuess = 'forumPost'; break;
case 'techarticle':
case 'apireference':
t.soGuess = 'report'; break;
Expand Down Expand Up @@ -429,6 +431,13 @@ function detectType(newItem, node, ret) {
case 'datacatalog':
case 'dataset':
t.so = 'journalArticle'; break; //until dataset gets implemented
// We don't want to detect items from these auxiliary containers,
// and therefore we return false in these cases.
case 'person':
case 'organization':
case 'place':
case 'postaladdress':
return false;

// special cases
case "article":
Expand Down Expand Up @@ -680,6 +689,9 @@ function detectType(newItem, node, ret) {
case "music.album":
t.og = "audioRecording";
break;
case "slideshare:presentation":
t.og = "presentation";
break;
case "website":
t.og = "webpage";
break;
Expand Down Expand Up @@ -870,15 +882,17 @@ function importItem(newItem, node) {
n.so+creatorType]);
//get presenters in unpublished conference papers on eprints
} else if (creatorType == "presenter") {
creators = getFirstResults(node, [n.z+creatorType+"s", n.eprints+"creators_name"]);
creators = getFirstResults(node, [n.z+creatorType+"s", n.eprints+"creators_name", n.so+"author"]);
} else if (creatorType == "director") {
creators = getFirstResults(node, [n.so+"director", n.z+creatorType+"s"]);
} else if (creatorType == "castMember") {
creators = getFirstResults(node, [n.video+"actor"]);
creators = getFirstResults(node, [n.video+"actor", n.so+"actor", n.so+"actors", n.z+creatorType+"s"]);
} else if (creatorType == "scriptwriter") {
creators = getFirstResults(node, [n.video+"writer"]);
creators = getFirstResults(node, [n.video+"writer", n.z+creatorType+"s"]);
} else if (creatorType == "producer") {
creators = getFirstResults(node, [n.so+"producer"]);
creators = getFirstResults(node, [n.so+"producer", n.z+creatorType+"s"]);
} else if (creatorType == "programmer") {
creators = getFirstResults(node, [n.so+"author", n.codemeta+"maintainer"]);
creators = getFirstResults(node, [n.so+"author", n.codemeta+"maintainer", n.z+creatorType+"s"]);
} else {
creators = getFirstResults(node, [n.z+creatorType+"s"]);
}
Expand Down Expand Up @@ -979,6 +993,14 @@ function importItem(newItem, node) {
if (typeof(publisher[0]) == "string") {
newItem.publisher = publisher[0];
} else {
// Publisher can be in another container.
// p is declared and reset here so that a failed lookup can neither raise a
// ReferenceError in the check below nor reuse a value left over from an
// earlier run.
var p = null;
try {
	p = Zotero.RDF.getContainerElements(publisher[0]);
} catch (e) {
	// Best effort: if the lookup throws, keep the publisher value as it is.
}
if (p && p.length) {
	publisher = p;
}

var type = Zotero.RDF.getTargets(publisher[0], rdf+"type");
if (type) {
type = Zotero.RDF.getResourceURI(type[0]);
Expand Down Expand Up @@ -1281,9 +1303,11 @@ function getNodes(skipCollections) {
// figure out if this is a part of another resource, or a linked
// attachment, or a creator
if (Zotero.RDF.getSources(node, n.dcterms+"isPartOf") ||
Zotero.RDF.getSources(node, n.so+"isPartOf") ||
Zotero.RDF.getSources(node, n.bib+"presentedAt") ||
Zotero.RDF.getSources(node, n.link+"link") ||
Zotero.RDF.getSources(node, n.dcterms+"creator")) {
Zotero.RDF.getSources(node, n.dcterms+"creator") ||
Zotero.RDF.getSources(node, n.so+"author")) {
continue;
}

Expand Down