zotero · zuphilip · May 29, 2016 · May 30, 2016 · May 19, 2018 · May 19, 2018
diff --git a/Embedded Metadata.js b/Embedded Metadata.js
@@ -9,7 +9,7 @@
 	"inRepository": true,
 	"translatorType": 4,
 	"browserSupport": "gcsibv",
-	"lastUpdated": "2018-10-07 16:46:47"
+	"lastUpdated": "2018-10-09 21:18:00"
 }
 
 /*
@@ -152,7 +152,7 @@ function getPrefixes(doc) {
 
 	//also look in html and head elements
 	var prefixes = (doc.documentElement.getAttribute('prefix') || '')
-		+ (doc.head.getAttribute('prefix') || '');
+		+ (doc.head ? doc.head.getAttribute('prefix') || '': '');
 	var prefixRE = /(\w+):\s+(\S+)/g;
 	var m;
 	while(m = prefixRE.exec(prefixes)) {
@@ -319,6 +319,120 @@ function init(doc, url, callback, forceLoadRDF) {
 			}
 		}
 	}
+
+
+	// Scraping methods for microdata.
+
+	var schemaItems = ZU.xpath(doc, '//*[@itemscope]');
+	// Some websites have also data in itemprop outside any itemscope (sigh)
+	var overallDocument = ZU.xpath(doc, '//html[not(@itemscope)]');
+	if (overallDocument.length>0) {
+		schemaItems[schemaItems.length] = overallDocument[0];
+	}
+
+	var auxiliaryItems = ["http://schema.org/Person",
+		"http://schema.org/Organization", "http://schema.org/Place",
+		"http://schema.org/PostalAddress"];
+	var structuralTypes = ["http://schema.org/BreadcrumbList",
+		"http://schema.org/ListItem", "https://schema.org/ImageObject",
+		"http://schema.org/SiteNavigationElement", "http://schema.org/WPAdBlock"];
+	// Traverse in reverse order to delete the elements of structural types
+	for (let i=schemaItems.length-1; i>=0; i--) {
+		let itemType = schemaItems[i].getAttribute("itemtype");
+		if (itemType) itemType = itemType.replace("https://", "http://")
+		//Z.debug(itemType)
+		if (structuralTypes.includes(itemType)) {
+			schemaItems.splice(i, 1);
+			continue;
+		}
+		// Assign the itemid to each item, which is by default the url,
+		// but auxiliary items like person, organization need a different itemid.
+		if (auxiliaryItems.includes(itemType)) {
+			schemaItems[i].itemid = url+"#itemid="+i;
+		} else {
+			schemaItems[i].itemid = url;
+		}
+	}
+
+	function microdataValue(propertyNode) {
+		//see also https://www.w3.org/TR/microdata/#values
+		if (propertyNode.hasAttribute("itemscope")) {
+			return {itemid: true, value: propertyNode.itemid};
+		}
+		switch(propertyNode.tagName.toLowerCase()) {
+			case "meta":
+				return propertyNode.getAttribute("content");
+			case "audio":
+			case "embed":
+			case "iframe":
+			case "img":
+			case "source":
+			case "track":
+			case "video":
+				return propertyNode.getAttribute("src");
+			case "a":
+			case "area":
+			case "link":
+				return propertyNode.getAttribute("href");
+			case "object":
+				return propertyNode.getAttribute("data");
+			case "data":
+			case "meter":
+				return propertyNode.getAttribute("value");
+			case "time":
+				return propertyNode.getAttribute("datetime");
+			case "span"://non-standard, but can occur
+				if (propertyNode.childNodes.length > 1 && propertyNode.getAttribute("content")) {
+					return propertyNode.getAttribute("content");
+				}
+			default:
+				return propertyNode.textContent;
+		}
+	}
+
+
+	for (var i=0; i<schemaItems.length; i++) {
+		var refs = schemaItems[i].getAttribute("itemref");//Currently itemref are not handled
+
+		var typesList = schemaItems[i].getAttribute("itemtype");
+		var baseUrl = "";
+		if (typesList) {
+			//if (structuralTypes.includes(typesList)) continue;
+			var types = typesList.split(" ");
+			for (var k=0; k<types.length; k++) {
+				types[k] = types[k].replace("https://", "http://");
+				statements.push([schemaItems[i].itemid, _prefixes.rdf+"type", types[k]]);
+			}
+			var endSignal = Math.max(types[0].lastIndexOf('/'), types[0].lastIndexOf('#'));
+			baseUrl = types[0].substr(0, endSignal+1);
+		} else {
+			baseUrl = "http://schema.org/";
+		}
+
+		//get all properties
+		var properties = ZU.xpath(schemaItems[i], './/*[@itemprop]');
+		var exclude = ZU.xpath(schemaItems[i], './/*[@itemscope]//*[@itemprop]');
+		for (let j=0; j<properties.length; j++) {
+			if (exclude.indexOf(properties[j]) == -1) {
+				var propertyList = properties[j].getAttribute("itemprop");
+				var propertyValue = microdataValue(properties[j]) || "";
+				// A common error is to put the author data in a A-tag, which
+				// then consequently gets evaluated to the src-parameter.
+				if (propertyList == "author" && properties[j].tagName == "A") {
+					propertyValue = properties[j].textContent;
+				}
+				//it is possible to assign the same value to multiple
+				//properties (separated by space) at the same time
+				var propertyNames = propertyList.split(" ");
+				for (let k=0; k<propertyNames.length; k++) {
+					statements.push([schemaItems[i].itemid, baseUrl+propertyNames[k], propertyValue]);
+				}
+			}
+		}
+	}
+
+	// For debugging microdata parsing
+	//Z.debug(statements);
 
 	if(statements.length || forceLoadRDF) {
 		// load RDF translator, so that we don't need to replicate import code
@@ -331,14 +445,19 @@ function init(doc, url, callback, forceLoadRDF) {
 		});
 
 		translator.getTranslatorObject(function(rdf) {
-			for(var i=0; i<statements.length; i++) {
-				var statement = statements[i];
-				rdf.Zotero.RDF.addStatement(statement[0], statement[1], statement[2], true);
+			for (let statement of statements) {
+				if (statement[2].itemid) {
+					if (statement[2].value) {
+						rdf.Zotero.RDF.addStatement(statement[0], statement[1], statement[2].value, false);
+					}
+				} else {
+					rdf.Zotero.RDF.addStatement(statement[0], statement[1], statement[2], true);
+				}
 			}
 			var nodes = rdf.getNodes(true);
 			rdf.defaultUnknownType = hwType || hwTypeGuess || generatorType ||
 				//if we have RDF data, then default to webpage
-				(nodes.length ? "webpage":false);
+				(nodes.length ? "webpage" : false);
 
 			//if itemType is overridden, no reason to run RDF.detectWeb
 			if(exports.itemType) {
@@ -1467,6 +1586,83 @@ var testCases = [
 				"seeAlso": []
 			}
 		]
+	},
+	{
+		"type": "web",
+		"url": "https://www.haaretz.com/islamic-jihad-if-hunger-striker-dies-we-ll-respond-with-force-1.5387076",
+		"items": [
+			{
+				"itemType": "newspaperArticle",
+				"title": "Islamic Jihad: If hunger striker dies, we'll respond with force against Israel",
+				"creators": [
+					{
+						"firstName": "Jack",
+						"lastName": "Khoury",
+						"creatorType": "author"
+					},
+					{
+						"firstName": "Shirly",
+						"lastName": "Seidler",
+						"creatorType": "author"
+					},
+					{
+						"firstName": "Ido",
+						"lastName": "Efrati",
+						"creatorType": "author"
+					}
+				],
+				"date": "2015-08-14T19:26:10+0300",
+				"abstractNote": "Islamic Jihad says it will no longer be committed to maintaining calm if Mohammed Allaan, who lost consciousness after 60-day hunger strike, dies.",
+				"libraryCatalog": "www.haaretz.com",
+				"publicationTitle": "haaretz.com",
+				"shortTitle": "Islamic Jihad",
+				"url": "https://www.haaretz.com/islamic-jihad-if-hunger-striker-dies-we-ll-respond-with-force-1.5387076",
+				"attachments": [
+					{
+						"title": "Snapshot"
+					}
+				],
+				"tags": [],
+				"notes": [],
+				"seeAlso": []
+			}
+		]
+	},
+	{
+		"type": "web",
+		"url": "https://www.zeit.de/politik/ausland/2011-09/libyen-bani-walid",
+		"items": [
+			{
+				"itemType": "webpage",
+				"title": "Libyen: Rebellen bereiten Angriff auf Bani Walid vor",
+				"creators": [
+					{
+						"firstName": "",
+						"lastName": "AFP",
+						"creatorType": "author"
+					},
+					{
+						"firstName": "",
+						"lastName": "dpa",
+						"creatorType": "author"
+					}
+				],
+				"date": "2011-09-04T14:55:40+02:00",
+				"abstractNote": "Die von Gadhafi-Anhängern geführte Stadt ist von Rebellentruppen eingekreist. Gespräche über eine friedliche Übergabe sind gescheitert, ein Angriff steht offenbar bevor.",
+				"language": "de",
+				"shortTitle": "Libyen",
+				"url": "https://www.zeit.de/politik/ausland/2011-09/libyen-bani-walid",
+				"websiteTitle": "ZEIT ONLINE",
+				"attachments": [
+					{
+						"title": "Snapshot"
+					}
+				],
+				"tags": [],
+				"notes": [],
+				"seeAlso": []
+			}
+		]
 	}
 ]
 /** END TEST CASES **/
diff --git a/RDF.js b/RDF.js
@@ -13,7 +13,7 @@
 	"inRepository": true,
 	"translatorType": 1,
 	"browserSupport": "gcsibv",
-	"lastUpdated": "2018-10-07 16:32:26"
+	"lastUpdated": "2018-10-09 21:27:00"
 }
 
 /*
@@ -385,6 +385,8 @@ function detectType(newItem, node, ret) {
 				break;
 			case 'discussionforumposting':
 				t.so = 'forumPost'; break;
+			case 'qapage'://e.g. stackoverflow
+				t.soGuess = 'forumPost'; break;
 			case 'techarticle':
 			case 'apireference':
 				t.soGuess = 'report'; break;
@@ -429,6 +431,13 @@ function detectType(newItem, node, ret) {
 			case 'datacatalog':
 			case 'dataset':
 				t.so = 'journalArticle'; break;  //until dataset gets implemented
+			// We don't want to detect items from these auxiliary containers,
+			// and therefore we return false in these cases.
+			case 'person':
+			case 'organization':
+			case 'place':
+			case 'postaladdress':
+				return false;
 
 			// specials cases
 			case "article":
@@ -680,6 +689,9 @@ function detectType(newItem, node, ret) {
 		case "music.album":
 			t.og = "audioRecording";
 		break;
+		case "slideshare:presentation":
+			t.og = "presentation";
+		break;
 		case "website":
 			t.og = "webpage";
 		break;
@@ -870,15 +882,17 @@ function importItem(newItem, node) {
 				n.so+creatorType]);
 		//get presenters in unpublished conference papers on eprints
 		} else if (creatorType == "presenter") {
-			creators = getFirstResults(node, [n.z+creatorType+"s", n.eprints+"creators_name"]);
+			creators = getFirstResults(node, [n.z+creatorType+"s", n.eprints+"creators_name", n.so+"author"]);
+		} else if (creatorType == "director") {
+			creators = getFirstResults(node, [n.so+"director", n.z+creatorType+"s"]);
 		} else if (creatorType == "castMember") {
-			creators = getFirstResults(node, [n.video+"actor"]);
+			creators = getFirstResults(node, [n.video+"actor", n.so+"actor", n.so+"actors", n.z+creatorType+"s"]);
 		} else if (creatorType == "scriptwriter") {
-			creators = getFirstResults(node, [n.video+"writer"]);
+			creators = getFirstResults(node, [n.video+"writer", n.z+creatorType+"s"]);
 		} else if (creatorType == "producer") {
-			creators = getFirstResults(node, [n.so+"producer"]);
+			creators = getFirstResults(node, [n.so+"producer", n.z+creatorType+"s"]);
 		} else if (creatorType == "programmer") {
-			creators = getFirstResults(node, [n.so+"author", n.codemeta+"maintainer"]);
+			creators = getFirstResults(node, [n.so+"author", n.codemeta+"maintainer", n.z+creatorType+"s"]);
 		} else {
 			creators = getFirstResults(node, [n.z+creatorType+"s"]);
 		}
@@ -979,6 +993,14 @@ function importItem(newItem, node) {
 		if (typeof(publisher[0]) == "string") {
 			newItem.publisher = publisher[0];
 		} else {
+			// Publisher can be in another container
+			try {
+				p = Zotero.RDF.getContainerElements(publisher[0]);
+			} catch(e) {}
+			if (p && p.length) {
+				publisher = p;
+			}
+
 			var type = Zotero.RDF.getTargets(publisher[0], rdf+"type");
 			if (type) {
 				type = Zotero.RDF.getResourceURI(type[0]);
@@ -1281,9 +1303,11 @@ function getNodes(skipCollections) {
 		// figure out if this is a part of another resource, or a linked
 		// attachment, or a creator
 		if (Zotero.RDF.getSources(node, n.dcterms+"isPartOf") ||
+		   Zotero.RDF.getSources(node, n.so+"isPartOf") ||
 		   Zotero.RDF.getSources(node, n.bib+"presentedAt") ||
 		   Zotero.RDF.getSources(node, n.link+"link") ||
-		   Zotero.RDF.getSources(node, n.dcterms+"creator")) {
+		   Zotero.RDF.getSources(node, n.dcterms+"creator") ||
+		   Zotero.RDF.getSources(node, n.so+"author")) {
 			continue;
 		}