Skip to content

Commit

Permalink
Merge pull request #218 from europeana/MET-1778
Browse files Browse the repository at this point in the history
Met 1778
  • Loading branch information
tarekkh authored Mar 18, 2019
2 parents 383df74 + e1ef878 commit ea19728
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 90 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
import javax.xml.xpath.XPathFactory;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;

import static eu.europeana.cloud.service.dps.PluginParameterKeys.DPS_TASK_INPUT_DATA;
import static eu.europeana.cloud.service.dps.PluginParameterKeys.CLOUD_LOCAL_IDENTIFIER;
Expand Down Expand Up @@ -102,7 +100,9 @@ private void useEuropeanaId(StormTaskTuple stormTaskTuple) throws EuropeanaIdExc

@Override
public void prepare() {

harvester = new Harvester();

try {
XPath xpath = XPathFactory.newInstance().newXPath();
expr = xpath.compile(METADATA_XPATH);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package eu.europeana.cloud.service.dps.storm.topologies.oaipmh.bolt.harvester;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.dspace.xoai.serviceprovider.exceptions.HttpException;
import org.dspace.xoai.serviceprovider.parameters.Parameters;

import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.TimeUnit;


/**
* Created by Tarek on 3/7/2019.
*/
public class CustomConnection {
private String baseUrl;
private HttpClient httpclient;
private int timeout = '\uea60';

public CustomConnection(String baseUrl) {
this.baseUrl = baseUrl;
this.httpclient = HttpClientBuilder.create().setConnectionTimeToLive(timeout, TimeUnit.SECONDS).build();
}

public String execute(Parameters parameters) throws HttpException {
HttpGet httpGet = null;
try {
httpGet = this.createGetRequest(parameters);
HttpResponse response = this.httpclient.execute(httpGet);
if (response.getStatusLine().getStatusCode() == 200) {
return getData(response.getEntity().getContent());
} else {
throw new HttpException("Error querying service. Returned HTTP Status Code: " + response.getStatusLine().getStatusCode());
}
} catch (IOException exception) {
throw new HttpException(exception);
} finally {
if (httpGet != null)
httpGet.releaseConnection();
}
}


private String getData(InputStream is) throws IOException {

return IOUtils.toString(is, "UTF-8");
}

private HttpGet createGetRequest(Parameters parameters) {
return new HttpGet(parameters.toUrl(this.baseUrl));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.dspace.xoai.serviceprovider.parameters.Parameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;

import javax.xml.xpath.XPathExpression;
import java.io.IOException;
Expand All @@ -21,7 +22,7 @@
*/
public class Harvester implements Serializable {
private static final Logger LOGGER = LoggerFactory.getLogger(Harvester.class);
private static final int DEFAULT_RETRIES=3;
private static final int DEFAULT_RETRIES = 3;


/**
Expand All @@ -30,8 +31,7 @@ public class Harvester implements Serializable {
* @param oaiPmhEndpoint OAI-PMH endpoint
* @param recordId record id
* @param metadataPrefix metadata prefix
* @param expression XPATH expression
* @param expression XPATH expression
* @return the harvested metadata
* @throws HarvesterException
* @throws IOException
Expand All @@ -43,8 +43,9 @@ public InputStream harvestRecord(String oaiPmhEndpoint, String recordId, String
GetRecordParameters params = new GetRecordParameters().withIdentifier(recordId).withMetadataFormatPrefix(metadataPrefix);
int retries = DEFAULT_RETRIES;
while (true) {
OAIClient client = new HttpOAIClient(oaiPmhEndpoint);
try (final InputStream record = client.execute(Parameters.parameters().withVerb(Verb.Type.GetRecord).include(params))) {
CustomConnection client = new CustomConnection(oaiPmhEndpoint);
try {
String record = client.execute(Parameters.parameters().withVerb(Verb.Type.GetRecord).include(params));
return new XmlXPath(record).xpath(expression);
} catch (Exception e) {
if (retries-- > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,17 @@
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.*;

/**
* Evaluates an XPath expression on XML passed as a {@code String}.
*
* @author krystian.
*/
class XmlXPath {
private final InputStream input;
private String input;

XmlXPath(InputStream input) {
XmlXPath(String input) {
this.input = input;
}

Expand All @@ -38,17 +35,13 @@ class XmlXPath {
*/
public InputStream xpath(XPathExpression expr) throws HarvesterException, IOException {
try {
final InputSource inputSource = new SAXSource(new InputSource(input)).getInputSource();
final InputSource inputSource = new SAXSource(new InputSource(new StringReader(input))).getInputSource();
final NodeList result = (NodeList) expr.evaluate(inputSource,
XPathConstants.NODESET);

return convertToStream(result);
} catch (XPathExpressionException | TransformerException e) {
throw new HarvesterException("Cannot xpath XML!", e);
} finally {
if (input != null) {
input.close();
}
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package eu.europeana.cloud.service.dps.storm.topologies.oaipmh;

import eu.europeana.cloud.service.dps.storm.topologies.oaipmh.bolt.harvester.CustomConnection;
import eu.europeana.cloud.service.dps.storm.topologies.oaipmh.common.OAIHelper;
import eu.europeana.cloud.service.dps.storm.topologies.oaipmh.helpers.SourceProvider;
import org.apache.commons.io.IOUtils;
import org.dspace.xoai.model.oaipmh.Header;
import org.dspace.xoai.model.oaipmh.Verb;
import org.dspace.xoai.serviceprovider.client.HttpOAIClient;
import org.dspace.xoai.serviceprovider.client.OAIClient;
import org.dspace.xoai.serviceprovider.exceptions.OAIRequestException;
import org.dspace.xoai.serviceprovider.parameters.GetRecordParameters;
import org.dspace.xoai.serviceprovider.parameters.ListIdentifiersParameters;
Expand All @@ -16,15 +14,10 @@
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

import java.io.InputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.isEmptyString;
import static org.hamcrest.core.IsNot.not;

@Ignore
@RunWith(Parameterized.class)
public class RealRepositoriesHarvestingTest {
Expand All @@ -37,9 +30,9 @@ public class RealRepositoriesHarvestingTest {
@Parameters
public static Collection<Object[]> data() {
Object[][] data = new Object[][]{
{"http://islandskort.is/oai", "edm",""},
{"http://baekur.is/oai", "edm",""},
{"http://adl2.kb.dk/oai", "ese",""}, //cannot harvest records - GetRecord method doesn't work, identifiers like "/hertz11_lev"
{"http://islandskort.is/oai", "edm", ""},
{"http://baekur.is/oai", "edm", ""},
{"http://adl2.kb.dk/oai", "ese", ""}, //cannot harvest records - GetRecord method doesn't work, identifiers like "/hertz11_lev"
{"http://panic.image.ece.ntua.gr:9000/photography/oai", "rdf", ""}, //extremely slow - might not work
{"http://oai-02.kb.dk/oai/provider", "ese", ""},
{"http://oai.kb.dk/oai/provider", "ese", ""},
Expand All @@ -53,40 +46,40 @@ public static Collection<Object[]> data() {
{"http://data.museums.eu/oai/OAIHandler", "edm", "galerija_bozidar_jakac_-_kostanjevica_na_krki"},
{"http://digitalna.nsk.hr/oai", "ese", "TEL"},
{"http://digitool.bibnat.ro/OAI-PUB/", "oai_dc", ""},
{"http://diglit.ub.uni-heidelberg.de/cgi-bin/digioai.cgi", "mets","druckschriften"},
{"http://dlib.mk/oai/request", "oai_dc","com_68275_66"},
{"http://europeana.cbuc.cat/oai.php", "ese","manuscritAB"},
{"http://formula1.csc.fi/oai-pmh", "edm","musketti"},
{"http://haw.nsk.hr/oai/", "oai_dc",""},
{"http://kobson.nb.rs/oaiphd/oai2.aspx", "oai_dc","phd"},
{"http://more.locloud.eu:8080/carare", "EDM","SNHB"},
{"http://more.locloud.eu:8080/locloud", "EDM","IPCHS"},
{"http://museion.esbirky.cz/w4m/oai/MRK", "ese","SMUH"},
{"http://nukoai.nuk.uni-lj.si:8089/repox/OAIHandler", "edm","Rise_of_Literacy_in_Europe_part_1"},
{"http://oai-bdb.onb.ac.at/Script/oai2.aspx", "ese","9846792"},
{"http://oaipmh.lnb.lv/repox/OAIHandler", "ese","zlpics"},
{"http://repox.kamra.si:8080/repox/OAIHandler", "edm","Europeana_Kamra_Multimedia"},
{"http://stary.webumenia.sk/oai-pmh", "ese","Europeana SNG"},
{"http://test117.ait.co.at/oai-provider-edm/oai/", "edm","MNHN"},
{"http://timarit.is/oai", "edm","publications"},
{"http://www.kulturpool.at/rest/export/1.0/oai-pmh", "edm","oeaw"},
{"http://www.manuscriptorium.com/oai/", "ese","digitized"},
{"https://aggregator.ekt.gr/aggregator-oai/request", "edm","Digital_Pylia"},
{"https://cdk.lib.cas.cz/oai", "ese",""},
{"https://dr.nsk.hr/oai", "oai_dc",""},
{"https://handrit.is/oai", "edm","europeana"},
{"https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do","edm","dfi"},
{"https://polona.pl/oai/OAIHandler","oai_dc","freeAccess"},
{"https://rep.nlg.gr/oai/request","edm","col_20.500.11781_30709"},
{"http://dspace.bcucluj.ro/oai/request","oai_dc","com_123456789_12911"},
{"http://digital-library.ulbsibiu.ro/oai/request","oai_dc","hdl_123456789_384"},
{"http://gutenberg.beic.it/OAI-PUB","oai_dc","oai"},
{"http://www.doabooks.org/oai","oai_dc",""},
{"http://www.doabooks.org/oai","oai_dc",""},
{"http://www.moldavica.bnrm.md/biblielmo/oaiserver","oai_dc","cartipost"},
{"https://mint-monitor.socialhistoryportal.org/ialhi/oai","rdf","1001"},
{"https://www.rijksmuseum.nl/api2/oai/8zmxuaJ2","europeana_edm",""},
{"http://dare.uva.nl/cgi/arno/oai/oapen","ese",""}
{"http://diglit.ub.uni-heidelberg.de/cgi-bin/digioai.cgi", "mets", "druckschriften"},
{"http://dlib.mk/oai/request", "oai_dc", "com_68275_66"},
{"http://europeana.cbuc.cat/oai.php", "ese", "manuscritAB"},
{"http://formula1.csc.fi/oai-pmh", "edm", "musketti"},
{"http://haw.nsk.hr/oai/", "oai_dc", ""},
{"http://kobson.nb.rs/oaiphd/oai2.aspx", "oai_dc", "phd"},
{"http://more.locloud.eu:8080/carare", "EDM", "SNHB"},
{"http://more.locloud.eu:8080/locloud", "EDM", "IPCHS"},
{"http://museion.esbirky.cz/w4m/oai/MRK", "ese", "SMUH"},
{"http://nukoai.nuk.uni-lj.si:8089/repox/OAIHandler", "edm", "Rise_of_Literacy_in_Europe_part_1"},
{"http://oai-bdb.onb.ac.at/Script/oai2.aspx", "ese", "9846792"},
{"http://oaipmh.lnb.lv/repox/OAIHandler", "ese", "zlpics"},
{"http://repox.kamra.si:8080/repox/OAIHandler", "edm", "Europeana_Kamra_Multimedia"},
{"http://stary.webumenia.sk/oai-pmh", "ese", "Europeana SNG"},
{"http://test117.ait.co.at/oai-provider-edm/oai/", "edm", "MNHN"},
{"http://timarit.is/oai", "edm", "publications"},
{"http://www.kulturpool.at/rest/export/1.0/oai-pmh", "edm", "oeaw"},
{"http://www.manuscriptorium.com/oai/", "ese", "digitized"},
{"https://aggregator.ekt.gr/aggregator-oai/request", "edm", "Digital_Pylia"},
{"https://cdk.lib.cas.cz/oai", "ese", ""},
{"https://dr.nsk.hr/oai", "oai_dc", ""},
{"https://handrit.is/oai", "edm", "europeana"},
{"https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do", "edm", "dfi"},
{"https://polona.pl/oai/OAIHandler", "oai_dc", "freeAccess"},
{"https://rep.nlg.gr/oai/request", "edm", "col_20.500.11781_30709"},
{"http://dspace.bcucluj.ro/oai/request", "oai_dc", "com_123456789_12911"},
{"http://digital-library.ulbsibiu.ro/oai/request", "oai_dc", "hdl_123456789_384"},
{"http://gutenberg.beic.it/OAI-PUB", "oai_dc", "oai"},
{"http://www.doabooks.org/oai", "oai_dc", ""},
{"http://www.doabooks.org/oai", "oai_dc", ""},
{"http://www.moldavica.bnrm.md/biblielmo/oaiserver", "oai_dc", "cartipost"},
{"https://mint-monitor.socialhistoryportal.org/ialhi/oai", "rdf", "1001"},
{"https://www.rijksmuseum.nl/api2/oai/8zmxuaJ2", "europeana_edm", ""},
{"http://dare.uva.nl/cgi/arno/oai/oapen", "ese", ""}
};
return Arrays.asList(data);
}
Expand All @@ -103,12 +96,12 @@ public void shouldListIdentifiersWithoutSetSpecSpecified() throws Exception {

ListIdentifiersParameters parameters = new ListIdentifiersParameters();
parameters.withMetadataPrefix(schema);
if(!set.isEmpty())
if (!set.isEmpty())
parameters.withSetSpec(set);
System.out.println("Harvesting " + endpoint);
int i = 0;
Iterator<Header> headerIterator = sourceProvider.provide(endpoint).listIdentifiers(parameters);
while (headerIterator.hasNext() ) { //should go over few pages, as they usually have 50 headers per page
while (headerIterator.hasNext()) { //should go over few pages, as they usually have 50 headers per page
Header header = headerIterator.next();
System.out.println(i + " " + header.getIdentifier());
++i;
Expand All @@ -122,46 +115,41 @@ public void shouldListIdentifiersAndGetRecordsWithoutSetSpecSpecified() throws E

ListIdentifiersParameters parameters = new ListIdentifiersParameters();
parameters.withMetadataPrefix(schema);
if(!set.isEmpty())
if (!set.isEmpty())
parameters.withSetSpec(set);
System.out.println("Harvesting " + endpoint);
int i = 0;
Iterator<Header> headerIterator = sourceProvider.provide(endpoint).listIdentifiers(parameters);
while (headerIterator.hasNext() && i < 110) { //should go over few pages, as they usually have 50 headers per page
Header header = headerIterator.next();
InputStream inputStream = harvestRecord(endpoint, header.getIdentifier(), schema);
String record = IOUtils.toString(inputStream, "UTF-8");
assertThat(record, not(isEmptyString()));
String record = harvestRecord(endpoint, header.getIdentifier(), schema);
System.out.println(record);
IOUtils.closeQuietly(inputStream);

i++;
}
System.out.println(i + " records harvested");
//then should not throw any xml parse exception
}

@Test
public void shouldHarvestRecord() throws Exception{
InputStream inputStream = harvestRecord("http://kobson.nb.rs/oaiphd/oai2.aspx","oai:doiphd:BG20120531LAZAREVICPASTI","oai_dc");
String record = IOUtils.toString(inputStream, "UTF-8");
assertThat(record, not(isEmptyString()));
public void shouldHarvestRecord() throws Exception {
String record = harvestRecord("http://kobson.nb.rs/oaiphd/oai2.aspx", "oai:doiphd:BG20120531LAZAREVICPASTI", "oai_dc");
System.out.println(record);
IOUtils.closeQuietly(inputStream);
}


@Test
public void shouldReadGranularity(){
public void shouldReadGranularity() {
OAIHelper helper = new OAIHelper(endpoint);
helper.getGranularity();
}

//this is how we download records, copy-pasted for simplicity
private InputStream harvestRecord(String oaiPmhEndpoint, String recordId, String metadataPrefix)
private String harvestRecord(String oaiPmhEndpoint, String recordId, String metadataPrefix)
throws OAIRequestException {
GetRecordParameters params = new GetRecordParameters().withIdentifier(recordId).withMetadataFormatPrefix(metadataPrefix);
while (true) {
OAIClient client = new HttpOAIClient(oaiPmhEndpoint);
CustomConnection client = new CustomConnection(oaiPmhEndpoint);
return client.execute(org.dspace.xoai.serviceprovider.parameters.Parameters.parameters().withVerb(Verb.Type.GetRecord).include(params));
}
}
Expand Down
Loading

0 comments on commit ea19728

Please sign in to comment.