From 1d944e69564ef099ae6eed1d9e9f4d212e77891a Mon Sep 17 00:00:00 2001 From: Dan Stillman Date: Thu, 26 Jul 2018 02:47:05 -0400 Subject: [PATCH] Move web translation back to /web and implement other /search modes I had consolidated URL handling into /search, but it didn't really make sense. Both web translation and text search can return a 300, but with different responses and different required handling. So clients should just parse out URLs, send those to /web (now as plain text), and send everything else to /search like with v1. Closes #6, closes #7 --- src/searchEndpoint.js | 99 +++-- src/server.js | 2 + src/textSearch.js | 505 ++++++++++++++++++++++++ src/webEndpoint.js | 97 +++++ src/{searchSession.js => webSession.js} | 14 +- test/server_search_test.js | 68 ---- test/web_test.js | 66 ++++ translate_search | 9 + translate_url | 2 +- translate_url_multiple | 2 +- 10 files changed, 735 insertions(+), 129 deletions(-) create mode 100644 src/textSearch.js create mode 100644 src/webEndpoint.js rename src/{searchSession.js => webSession.js} (98%) delete mode 100644 test/server_search_test.js create mode 100644 test/web_test.js create mode 100755 translate_search diff --git a/src/searchEndpoint.js b/src/searchEndpoint.js index 6fc80e0..53cf121 100644 --- a/src/searchEndpoint.js +++ b/src/searchEndpoint.js @@ -23,77 +23,74 @@ ***** END LICENSE BLOCK ***** */ -const SearchSession = require('./searchSession'); - -// Timeout for select requests, in seconds -//const SELECT_TIMEOUT = 120; -const SELECT_TIMEOUT = 15; -const sessionsWaitingForSelection = {}; +const config = require('config'); +const Translate = require('./translation/translate'); +const TextSearch = require('./textSearch'); var SearchEndpoint = module.exports = { - requestsSinceGC: 0, - handle: async function (ctx, next) { ctx.assert(ctx.is('text/plain') || ctx.is('json'), 415); - setTimeout(() => { - this.gc(); - }); - var data = ctx.request.body; if (!data) { ctx.throw(400, "POST data not provided\n"); } - // If follow-up request, retrieve session and update context - var query; - var session; - if (typeof data == 'object') { - let sessionID = data.session; - if (!sessionID) { - ctx.throw(400, "'session' not provided"); - } - session = sessionsWaitingForSelection[sessionID]; - if (!session) { - ctx.throw(400, "Session not found"); - } - delete sessionsWaitingForSelection[sessionID]; - session.ctx = ctx; - session.next = next; - session.data = data; - } - else { - session = new SearchSession(ctx, next, data); + // Look for DOI, ISBN, etc. + var identifiers = Zotero.Utilities.Internal.extractIdentifiers(data); + + // Use PMID only if it's the only text in the query + if (identifiers.length && identifiers[0].PMID && identifiers[0].PMID !== data.trim()) { + identifiers = []; } - // URL - if (typeof data == 'object' || data.match(/^https?:/)) { - await session.handleURL(); - - // Store session if returning multiple choices - if (ctx.response.status == 300) { - sessionsWaitingForSelection[session.id] = session; - } + // Text search + if (!identifiers.length) { + await TextSearch.handle(ctx, next); return; } - ctx.throw(501); + this.handleIdentifier(ctx, identifiers[0]); }, - /** - * Perform garbage collection every 10 requests - */ - gc: function () { - if ((++this.requestsSinceGC) == 3) { - for (let i in sessionsWaitingForSelection) { - let session = sessionsWaitingForSelection[i]; - if (session.started && Date.now() >= session.started + SELECT_TIMEOUT * 1000) { - delete sessionsWaitingForSelection[i]; - } + handleIdentifier: async function (ctx, identifier) { + // Identifier + try { + var translate = new Translate.Search(); + translate.setIdentifier(identifier); + let translators = await translate.getTranslators(); + if (!translators.length) { + ctx.throw(501, "No translators available", { expose: true }); + } + translate.setTranslator(translators); + + var items = await translate.translate({ + libraryID: false + }); + } + catch (e) { + if (e == translate.ERROR_NO_RESULTS) { + ctx.throw(501, e, { expose: true }); } - this.requestsSinceGC = 0; + + Zotero.debug(e, 1); + ctx.throw( + 500, + "An error occurred during translation. " + + "Please check translation with the Zotero client.", + { expose: true } + ); } + + // Translation can return multiple items (e.g., a parent item and notes pointing to it), + // so we have to return an array with keyed items + var newItems = []; + items.forEach(item => { + newItems.push(...Zotero.Utilities.itemToAPIJSON(item)); + }); + + ctx.response.body = newItems; } }; diff --git a/src/server.js b/src/server.js index 8ee41e4..6ede225 100644 --- a/src/server.js +++ b/src/server.js @@ -33,9 +33,11 @@ require('./zotero'); const Debug = require('./debug'); const Translators = require('./translators'); const SearchEndpoint = require('./searchEndpoint'); +const WebEndpoint = require('./webEndpoint'); const app = module.exports = new Koa(); app.use(bodyParser({ enableTypes: ['text', 'json']})); +app.use(_.post('/web', WebEndpoint.handle.bind(WebEndpoint))); app.use(_.post('/search', SearchEndpoint.handle.bind(SearchEndpoint))); Debug.init(1); diff --git a/src/textSearch.js b/src/textSearch.js new file mode 100644 index 0000000..90dcdb3 --- /dev/null +++ b/src/textSearch.js @@ -0,0 +1,505 @@ +/* + ***** BEGIN LICENSE BLOCK ***** + + Copyright © 2018 Center for History and New Media + George Mason University, Fairfax, Virginia, USA + https://www.zotero.org + + This file is part of Zotero. + + Zotero is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Zotero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with Zotero. If not, see . + + ***** END LICENSE BLOCK ***** +*/ + +const config = require('config'); +const HTTP = require('./http'); + +module.exports = { + /** + * Handle text search + * + * @return {Promise} + */ + handle: async function (ctx, next) { + // If identifier-search server isn't available, return 501 + if (!config.has('identifierSearchURL') || !config.get("identifierSearchURL")) { + ctx.throw(501, "No identifiers found", { expose: true }); + } + + var data = ctx.request.body; + + var result = await search( + data, + ctx.query && ctx.query.start + ); + + // Throw selection if two or more items are found, or the selection flag is marked + if (result.items.length >= 2 || result.items.length >= 1 && result.select) { + let newItems = {}; + + for (let item of result.items) { + let DOI = item.DOI; + let ISBN = item.ISBN; + + if (!DOI && item.extra) { + let m = item.extra.match(/DOI: (.*)/); + if (m) DOI = m[1]; + } + + if (!ISBN && item.extra) { + let m = item.extra.match(/ISBN: (.*)/); + if (m) ISBN = m[1]; + } + + let identifier; + // DOI has a priority over ISBN for items that have both + if (DOI) { + identifier = DOI; + } + else if (item.ISBN) { + identifier = ISBN.split(' ')[0]; + } + + newItems[identifier] = { + itemType: item.itemType, + title: item.title, + description: formatDescription(item), + }; + } + + let headers = {}; + // If there were more results, include a link to the next result set + if (result.next) { + headers.Link = `; rel="next"`; + } + ctx.response.status = 300; + ctx.response.headers = headers; + + + // + // TODO: Differentiate from web request 300 + // + + + ctx.response.body = newItems; + return; + } + + if (result.items.length === 1) { + ctx.response.body = Zotero.Utilities.itemToAPIJSON(result.items[0]); + return; + } + + ctx.response.body = []; + } +}; + + +async function search(query, start) { + const numResults = 3; + let identifiers; + let moreResults = false; + try { + let xmlhttp = await HTTP.request( + "GET", + config.get("identifierSearchURL") + encodeURIComponent(query), + { + timeout: 15000 + } + ); + identifiers = JSON.parse(xmlhttp.responseText); + + // If passed a start= parameter, skip ahead + let startPos = 0; + if (start) { + for (let i = 0; i < identifiers.length; i++) { + if (identifierToToken(identifiers[i]) == start) { + startPos = i + 1; + break; + } + } + } + + if (identifiers.length > startPos + numResults + 1) { + moreResults = true; + } + + identifiers = identifiers.slice(startPos); + } catch(e) { + Zotero.debug(e, 1); + return {select: false, items: []}; + } + + let items = []; + let nextLastIdentifier = null; + for (let identifier of identifiers) { + let translate = new Zotero.Translate.Search(); + try { + translate.setIdentifier(identifier); + let translators = await translate.getTranslators(); + if (!translators.length) { + continue; + } + translate.setTranslator(translators); + + let newItems = await translate.translate({ + libraryID: false + }); + + if (newItems.length) { + let seq = getLongestCommonSequence(newItems[0].title, query); + if (seq.length >= 6 && seq.split(' ').length >= 2) { + items.push(newItems[0]); + // Keep track of last identifier if we're limiting results + if (moreResults) { + nextLastIdentifier = identifier; + } + if (items.length == numResults) { + break; + } + } + } + } + catch (e) { + if (e !== translate.ERROR_NO_RESULTS) { + Zotero.debug(e, 1); + } + } + } + + return { + // Force item selection, even for a single item + select: true, + items, + next: nextLastIdentifier ? identifierToToken(nextLastIdentifier) : null + }; + + // // Query Crossref and LoC/GBV in parallel to respond faster to the client + // let [crossrefItems, libraryItems] = await Promise.all([queryCrossref(query), queryLibraries(query)]); + // + // // Subtract book reviews from Crossref + // crossrefItems = subtractCrossrefItems(crossrefItems, libraryItems); + // + // let items = crossrefItems.concat(libraryItems); + // + // // Filter out too fuzzy items, by comparing item title (and other metadata) against query + // return await filterResults(items, query); +} + + +function formatDescription(item) { + let parts = []; + + let authors = []; + for (let creator of item.creators) { + if (creator.creatorType === 'author' && creator.lastName) { + authors.push(creator.lastName); + if (authors.length === 3) break; + } + } + + if(authors.length) parts.push(authors.join(', ')); + + if (item.date) { + let m = item.date.toString().match(/[0-9]{4}/); + if (m) parts.push(m[0]); + } + + if(item.publicationTitle) { + parts.push(item.publicationTitle); + } else if(item.publisher) { + parts.push(item.publisher); + } + + return parts.join(' \u2013 '); +} + + +function subtractCrossrefItems(crossrefItems, libraryItems) { + let items = []; + for(let crossrefItem of crossrefItems) { + // Keep books and book sections + if(['book', 'bookSection'].includes(crossrefItem.itemType)) { + items.push(crossrefItem); + continue; + } + + let crossrefTitle = crossrefItem.title; + // Remove all tags + crossrefTitle = crossrefTitle.replace(/<\/?\w+[^<>]*>/gi, ''); + crossrefTitle = crossrefTitle.replace(/:/g, ' '); + + // Normalize title, split to words, filter out empty array elements + crossrefTitle = normalize(crossrefTitle).split(' ').filter(x => x).join(' '); + + let found = false; + for(let libraryItem of libraryItems) { + let libraryTitle = libraryItem.title; + // Remove all tags + libraryTitle = libraryTitle.replace(/<\/?\w+[^<>]*>/gi, ''); + libraryTitle = libraryTitle.replace(/:/g, ' '); + + // Normalize title, split to words, filter out empty array elements + libraryTitle = normalize(libraryTitle).split(' ').filter(x => x).join(' '); + + if(crossrefTitle.includes(libraryTitle)) { + found = true; + break; + } + } + + if(!found) { + items.push(crossrefItem); + } + } + + return items; +} + +async function queryCrossref(query) { + let items = []; + try { + let translate = new Zotero.Translate.Search(); + // Crossref REST + translate.setTranslator("0a61e167-de9a-4f93-a68a-628b48855909"); + translate.setSearch({query}); + items = await translate.translate({libraryID: false}); + } + catch (e) { + Zotero.debug(e, 2); + } + return items; +} + +/** + * Queries LoC and if that fails, queries GBV + */ +async function queryLibraries(query) { + let items = []; + try { + let translate = new Zotero.Translate.Search(); + // Library of Congress ISBN + translate.setTranslator("c070e5a2-4bfd-44bb-9b3c-4be20c50d0d9"); + translate.setSearch({query}); + items = await translate.translate({libraryID: false}); + } + catch (e) { + Zotero.debug(e, 2); + try { + let translate = new Zotero.Translate.Search(); + // Gemeinsamer Bibliotheksverbund ISBN + translate.setTranslator("de0eef58-cb39-4410-ada0-6b39f43383f9"); + translate.setSearch({query}); + items = await translate.translate({libraryID: false}); + } + catch (e) { + Zotero.debug(e, 2); + } + } + return items; +} + +/** + * Decomposes all accents and ligatures, + * filters out symbols that aren't space or alphanumeric, + * and lowercases alphabetic symbols. + */ +function normalize(text) { + let rx = XRegExp('[^\\pL 0-9]', 'g'); + text = XRegExp.replace(text, rx, ''); + text = text.normalize('NFKD'); + text = XRegExp.replace(text, rx, ''); + text = text.toLowerCase(); + return text; +} + +/** + * Checks if a given word equals to any of the authors' names + */ +function hasAuthor(authors, word) { + return authors.some(author => { + return (author.firstName && normalize(author.firstName).split(' ').includes(word)) + || (author.lastName && normalize(author.lastName).split(' ').includes(word)); + }); +} + +/** + * Tries to find the longest common words sequence between + * item title and query text. Query text must include title (or part of it) + * from the beginning. If there are leftover query words, it tries to + * validate them against item metadata (currently only authors and year) + */ +async function filterResults(items, query) { + let filteredItems = []; + let select = false; + + // Normalize query, split to words, filter out empty array elements + let queryWords = normalize(query).split(' ').filter(x => x); + + for (let item of items) { + let DOI = item.DOI; + let ISBN = item.ISBN; + + if (!DOI && item.extra) { + let m = item.extra.match(/DOI: (.*)/); + if (m) DOI = m[1]; + } + + if (!ISBN && item.extra) { + let m = item.extra.match(/ISBN: (.*)/); + if (m) ISBN = m[1]; + } + + if (!DOI && !ISBN) continue; + let title = item.title; + // Remove all tags + title = title.replace(/<\/?\w+[^<>]*>/gi, ''); + title = title.replace(/:/g, ' '); + + // Normalize title, split to words, filter out empty array elements + let titleWords = normalize(title).split(' ').filter(x => x); + + let longestFrom = 0; + let longestLen = 0; + + // Finds the longest common words sequence between query text and item.title + for (let i = 0; i < queryWords.length; i++) { + for (let j = queryWords.length; j > 0; j--) { + let a = queryWords.slice(i, j); + for (let k = 0; k < titleWords.length - a.length + 1; k++) { + let b = titleWords.slice(k, a.length + k); + if (a.length && b.length && a.join(' ') === b.join(' ')) { + if (a.length > longestLen) { + longestFrom = i; + longestLen = b.length; + } + } + } + } + } + + // At least two common words sequence must be found between query and title + if (longestLen < 1) continue; + + // Longest common sequence of words + let foundPart = queryWords.slice(longestFrom, longestLen); + + // Remaining words + let rems = queryWords.slice(0, longestFrom); + rems = rems.concat(queryWords.slice(longestLen)); + + // If at least one remaining word is left, it tries to compare it against item metadata. + // Otherwise the whole query text is found in the title, and we have a full match + if (rems.length) { + let foundAuthor = false; + let needYear = false; + let foundYear = false; + + // Still remaining words + let rems2 = []; + + for (let rem of rems) { + // Ignore words + if (['the', 'a', 'an'].indexOf(rem) >= 0) continue; + + // If the remaining word has at least 2 chars and exists in metadata authors + if (rem.length >= 2 && hasAuthor(item.creators, rem)) { + foundAuthor = true; + continue; + } + + // If the remaining word is a 4 digit number (year) + if (/^[0-9]{4}$/.test(rem)) { + needYear = true; + + if (item.date) { + // If the remaining word exists in the item date + let m = item.date.toString().match(/[0-9]{4}/); + if (m && m[0] === rem) { + foundYear = true; + continue; + } + } + } + + // Push the word that is still remaining + rems2.push(rem); + } + + // If a year exists in the query, but is not matched to the item date + if (needYear && !foundYear) continue; + + // If there are still remaining words and none of authors are found + if (rems2.length && !foundAuthor) continue; + } + + // If the query part that was found in title is shorter than 30 symbols + if (foundPart.join(' ').length < 30) select = true; + + filteredItems.push({ + matchedLen: foundPart.join(' ').length, + titleLen: titleWords.join(' ').length, + item + }); + } + + // Sort results by matched text length + // and how close the matched text length is to title length + filteredItems.sort(function (a, b) { + if (b.matchedLen < a.matchedLen) return -1; + if (b.matchedLen > a.matchedLen) return 1; + return Math.abs(a.matchedLen - a.titleLen) - Math.abs(b.matchedLen - b.titleLen); + }); + + filteredItems = filteredItems.map(item => item.item); + + return {select, items: filteredItems}; +} + +function getLongestCommonSequence(title, query) { + title = title.replace(/<\/?\w+[^<>]*>/gi, ''); + title = title.replace(/:/g, ' '); + + query = query.replace(/:/g, ' '); + + // Normalize, split to words and filter out empty array elements + let titleWords = normalize(title).split(' ').filter(x => x); + let queryWords = normalize(query).split(' ').filter(x => x); + + let longestFrom = 0; + let longestLen = 0; + + // Finds the longest common words sequence between query text and item.title + for (let i = 0; i < queryWords.length; i++) { + for (let j = queryWords.length; j > 0; j--) { + let a = queryWords.slice(i, j); + for (let k = 0; k < titleWords.length - a.length + 1; k++) { + let b = titleWords.slice(k, a.length + k); + if (a.length && b.length && a.join(' ') === b.join(' ')) { + if (a.length > longestLen) { + longestFrom = i; + longestLen = b.length; + } + } + } + } + } + + return queryWords.slice(longestFrom, longestFrom + longestLen).join(' '); +} + +function identifierToToken(identifier) { + return Zotero.Utilities.Internal.md5(JSON.stringify(identifier)); +} diff --git a/src/webEndpoint.js b/src/webEndpoint.js new file mode 100644 index 0000000..d79f43e --- /dev/null +++ b/src/webEndpoint.js @@ -0,0 +1,97 @@ +/* + ***** BEGIN LICENSE BLOCK ***** + + Copyright © 2018 Corporation for Digital Scholarship + Vienna, Virginia, USA + https://www.zotero.org + + This file is part of Zotero. + + Zotero is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Zotero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with Zotero. If not, see . + + ***** END LICENSE BLOCK ***** +*/ + +const WebSession = require('./webSession'); + +// Timeout for select requests, in seconds +//const SELECT_TIMEOUT = 120; +const SELECT_TIMEOUT = 15; +const sessionsWaitingForSelection = {}; + +var requestsSinceGC = 0; + + +var WebEndpoint = module.exports = { + handle: async function (ctx, next) { + ctx.assert(ctx.is('text/plain') || ctx.is('json'), 415); + + setTimeout(() => { + gc(); + }); + + var data = ctx.request.body; + + if (!data) { + ctx.throw(400, "POST data not provided\n"); + } + + // If follow-up URL request, retrieve session and update context + var query; + var session; + if (typeof data == 'object') { + let sessionID = data.session; + if (!sessionID) { + ctx.throw(400, "'session' not provided"); + } + session = sessionsWaitingForSelection[sessionID]; + if (!session) { + ctx.throw(400, "Session not found"); + } + delete sessionsWaitingForSelection[sessionID]; + session.ctx = ctx; + session.next = next; + session.data = data; + } + else { + session = new WebSession(ctx, next, data); + } + + if (typeof data != 'object' && !data.match(/^https?:/)) { + ctx.throw(400, "URL not provided"); + } + + await session.handleURL(); + + // Store session if returning multiple choices + if (ctx.response.status == 300) { + sessionsWaitingForSelection[session.id] = session; + } + } +}; + +/** + * Perform garbage collection every 10 requests + */ +function gc() { + if ((++requestsSinceGC) == 3) { + for (let i in sessionsWaitingForSelection) { + let session = sessionsWaitingForSelection[i]; + if (session.started && Date.now() >= session.started + SELECT_TIMEOUT * 1000) { + delete sessionsWaitingForSelection[i]; + } + } + requestsSinceGC = 0; + } +} diff --git a/src/searchSession.js b/src/webSession.js similarity index 98% rename from src/searchSession.js rename to src/webSession.js index 775dad6..5dfe62d 100644 --- a/src/searchSession.js +++ b/src/webSession.js @@ -28,6 +28,7 @@ const urlLib = require('url'); const Translate = require('./translation/translate'); const HTTP = require('./http'); const Translators = require('./translators'); +const SearchEndpoint = require('./searchEndpoint'); const SERVER_TRANSLATION_TIMEOUT = 30; @@ -66,14 +67,16 @@ SearchSession.prototype.handleURL = async function () { if (!doi) { this.ctx.throw(500, "An error occurred retrieving the document\n"); } - return this.handleDOI(doi); + await SearchEndpoint.handleIdentifier(this.ctx, { DOI: doi }); + return; } } // If a doi.org URL, use search handler if (url.match(/^https?:\/\/[^\/]*doi\.org\//)) { let doi = Zotero.Utilities.cleanDOI(url); - return this.handleDOI(doi); + await SearchEndpoint.handleIdentifier(this.ctx, { DOI: doi }); + return; } var urlsToTry = config.get('deproxifyURLs') ? this.deproxifyURL(url) : [url]; @@ -331,12 +334,7 @@ SearchSession.prototype.selectDone = function () { };*/ -/** - * @return {Promise} - */ -SearchSession.prototype.handleDOI = async function (doi) { - this.ctx.throw(501); -}; + /** diff --git a/test/server_search_test.js b/test/server_search_test.js deleted file mode 100644 index 1f629a1..0000000 --- a/test/server_search_test.js +++ /dev/null @@ -1,68 +0,0 @@ -describe("/search", function () { - describe("URL", function () { - it("should translate a generic webpage", async function () { - var url = testURL + 'plain'; - var response = await request() - .post('/search') - .set('Content-Type', 'text/plain') - .send(url); - assert.equal(response.statusCode, 200); - var json = response.body; - - assert.lengthOf(json, 1); - assert.equal(json[0].itemType, 'webpage'); - assert.equal(json[0].title, 'Test'); - }); - - - it("should translate a webpage with embedded metadata", async function () { - var url = testURL + 'single'; - var response = await request() - .post('/search') - .set('Content-Type', 'text/plain') - .send(url); - assert.equal(response.statusCode, 200); - var json = response.body; - - assert.lengthOf(json, 1); - assert.equal(json[0].itemType, 'journalArticle'); - assert.equal(json[0].title, 'Title'); - }); - - - it("should return multiple results and perform follow-up translation", async function () { - var url = testURL + 'multiple'; - var response = await request() - .post('/search') - .set('Content-Type', 'text/plain') - .send(url); - assert.equal(response.statusCode, 300); - var json = response.body; - assert.equal(json.url, url); - assert.property(json, 'session'); - assert.deepEqual(json.items, { 0: 'A', 1: 'B', 2: 'C' }); - - delete json.items[1]; - - response = await request() - .post('/search') - .send(json); - assert.equal(response.statusCode, 200); - json = response.body; - assert.lengthOf(json, 2); - assert.equal(json[0].title, 'A'); - assert.equal(json[1].title, 'C'); - }); - - - it("should return 400 if a page returns a 404", async function () { - var url = testURL + '404'; - var response = await request() - .post('/search') - .set('Content-Type', 'text/plain') - .send(url); - assert.equal(response.statusCode, 400); - assert.equal(response.text, 'Remote page not found'); - }); - }); -}); diff --git a/test/web_test.js b/test/web_test.js new file mode 100644 index 0000000..243a2ed --- /dev/null +++ b/test/web_test.js @@ -0,0 +1,66 @@ +describe("/web", function () { + it("should translate a generic webpage", async function () { + var url = testURL + 'plain'; + var response = await request() + .post('/web') + .set('Content-Type', 'text/plain') + .send(url); + assert.equal(response.statusCode, 200); + var json = response.body; + + assert.lengthOf(json, 1); + assert.equal(json[0].itemType, 'webpage'); + assert.equal(json[0].title, 'Test'); + }); + + + it("should translate a webpage with embedded metadata", async function () { + var url = testURL + 'single'; + var response = await request() + .post('/web') + .set('Content-Type', 'text/plain') + .send(url); + assert.equal(response.statusCode, 200); + var json = response.body; + + assert.lengthOf(json, 1); + assert.equal(json[0].itemType, 'journalArticle'); + assert.equal(json[0].title, 'Title'); + }); + + + it("should return multiple results and perform follow-up translation", async function () { + var url = testURL + 'multiple'; + var response = await request() + .post('/web') + .set('Content-Type', 'text/plain') + .send(url); + assert.equal(response.statusCode, 300); + var json = response.body; + assert.equal(json.url, url); + assert.property(json, 'session'); + assert.deepEqual(json.items, { 0: 'A', 1: 'B', 2: 'C' }); + + delete json.items[1]; + + response = await request() + .post('/web') + .send(json); + assert.equal(response.statusCode, 200); + json = response.body; + assert.lengthOf(json, 2); + assert.equal(json[0].title, 'A'); + assert.equal(json[1].title, 'C'); + }); + + + it("should return 400 if a page returns a 404", async function () { + var url = testURL + '404'; + var response = await request() + .post('/web') + .set('Content-Type', 'text/plain') + .send(url); + assert.equal(response.statusCode, 400); + assert.equal(response.text, 'Remote page not found'); + }); +}); diff --git a/translate_search b/translate_search new file mode 100755 index 0000000..3853e5e --- /dev/null +++ b/translate_search @@ -0,0 +1,9 @@ +#!/bin/bash + +URL="$1" +if [ -z "$1" ]; then + echo "Usage: $0 identifier-or-phrase" + exit 1 +fi + +curl -v -d "$URL" -H "Content-Type: text/plain" 127.0.0.1:1969/search diff --git a/translate_url b/translate_url index 646002c..b0f2011 100755 --- a/translate_url +++ b/translate_url @@ -6,4 +6,4 @@ if [ -z "$1" ]; then exit 1 fi -curl -v -d "$URL" -H "Content-Type: text/plain" 127.0.0.1:1969/search +curl -v -d "$URL" -H "Content-Type: text/plain" 127.0.0.1:1969/web diff --git a/translate_url_multiple b/translate_url_multiple index c1b13c6..1227580 100755 --- a/translate_url_multiple +++ b/translate_url_multiple @@ -6,4 +6,4 @@ if [ -z "$1" ]; then exit 1 fi -curl -v -d "$URL" -H "Content-Type: text/plain" 127.0.0.1:1969/search | jq '{ url: .url, session: .session, items: .items | to_entries | [.[0]] | from_entries }' | curl -v -d @- -H "Content-Type: application/json" 127.0.0.1:1969/search +curl -v -d "$URL" -H "Content-Type: text/plain" 127.0.0.1:1969/web | jq '{ url: .url, session: .session, items: .items | to_entries | [.[0]] | from_entries }' | curl -v -d @- -H "Content-Type: application/json" 127.0.0.1:1969/web