diff --git a/src/main/java/de/landsh/opendata/catalogproxy/CatalogFilter.java b/src/main/java/de/landsh/opendata/catalogproxy/CatalogFilter.java index a0024dd842f7230c044ab2a8e2ada3df2d8ff0d3..daababc4a235d8cdc809491d2a62d9936fb38d6e 100644 --- a/src/main/java/de/landsh/opendata/catalogproxy/CatalogFilter.java +++ b/src/main/java/de/landsh/opendata/catalogproxy/CatalogFilter.java @@ -6,6 +6,7 @@ import org.apache.jena.riot.RDFLanguages; import org.apache.jena.riot.RDFParser; import org.apache.jena.riot.system.ErrorHandlerFactory; import org.apache.jena.util.ResourceUtils; +import org.apache.jena.vocabulary.DCAT; import org.apache.jena.vocabulary.DCTerms; import org.apache.jena.vocabulary.RDF; import org.springframework.beans.factory.annotation.Value; @@ -35,7 +36,7 @@ public class CatalogFilter { } Model work(InputStream in) { - Model model = ModelFactory.createDefaultModel(); + final Model model = ModelFactory.createDefaultModel(); RDFParser.create() .source(in) @@ -44,16 +45,16 @@ public class CatalogFilter { .base(baseURL) .parse(model); - Set<String> usedDistributionIds = new HashSet<>(); + final Set<String> usedDistributionIds = new HashSet<>(); - final ResIterator it = model.listSubjectsWithProperty(RDF.type, ResourceFactory.createResource("http://www.w3.org/ns/dcat#Dataset")); + final ResIterator it = model.listSubjectsWithProperty(RDF.type, DCAT.Dataset); while (it.hasNext()) { Resource dataset = it.next(); if (hasAtLeastOneValidDistribution(dataset)) { usedDistributionIds.addAll(getDistributionsForDataset(dataset)); } else { model.remove(dataset.listProperties()); - model.remove(model.listStatements(null, ResourceFactory.createProperty("http://www.w3.org/ns/dcat#dataset"), dataset)); + model.remove(model.listStatements(null, DCAT.dataset, dataset)); } } @@ -62,19 +63,40 @@ public class CatalogFilter { removeUnusedLocations(model); minimizeLocations(model); rewriteHydraURLs(model); + addDownloadURLs(model); return model; } + /** + * Add downloadURL 
properties to Distributions. The German DCAT-AP.de treats downloadURL as a not so + * important optional property and relies on the accessURL. However, the European data portal values the + * downloadURL property highly. Distributions without an accessURL resource are left unchanged. + */ + void addDownloadURLs(Model model) { + final ResIterator it = model.listSubjectsWithProperty(RDF.type, DCAT.Distribution); + while (it.hasNext()) { + final Resource distribution = it.next(); + + final Resource accessURL = distribution.getPropertyResourceValue(DCAT.accessURL); + final Resource downloadURL = distribution.getPropertyResourceValue(DCAT.downloadURL); + + if (downloadURL == null && accessURL != null) { + distribution.addProperty(DCAT.downloadURL, accessURL); + } + + } + } + void rewriteHydraURLs(Model model) { final ResIterator it = model.listSubjectsWithProperty(RDF.type, ResourceFactory.createResource("http://www.w3.org/ns/hydra/core#PagedCollection")); if (it.hasNext()) { final Resource pagedCollection = it.nextResource(); final String originalURL = StringUtils.substringBefore(pagedCollection.getURI(), "catalog.xml"); - List<Statement> changeStatements = new ArrayList<>(); + final List<Statement> changeStatements = new ArrayList<>(); - StmtIterator iterator = pagedCollection.listProperties(); + final StmtIterator iterator = pagedCollection.listProperties(); while (iterator.hasNext()) { Statement stmt = iterator.next(); if (stmt.getObject().isLiteral()) { @@ -92,7 +114,6 @@ public class CatalogFilter { } ResourceUtils.renameResource(pagedCollection, pagedCollection.getURI().replaceFirst(originalURL, baseURL)); - } } @@ -130,7 +151,7 @@ public class CatalogFilter { void removeAnonymousResources(Model model) { final ResIterator it = model.listSubjects(); - Collection<Resource> allObjects = allObjects(model); + final Collection<Resource> allObjects = allObjects(model); while (it.hasNext()) { Resource resource = it.next(); @@ -141,8 +162,8 @@ public class CatalogFilter { } Collection<Resource> allObjects(Model model) { - Set<Resource> result = new 
HashSet<>(); - NodeIterator it = model.listObjects(); + final Set<Resource> result = new HashSet<>(); + final NodeIterator it = model.listObjects(); while (it.hasNext()) { RDFNode next = it.next(); if (next.isResource()) { @@ -156,9 +177,9 @@ public class CatalogFilter { * Entfernt aus dem Model alle dcat:Distribution Instanzen, deren URI nicht in der angegebenen Collection enthalten sind. */ void removeUnusedDistributions(Model model, Collection<String> usedDistributionIds) { - final ResIterator it = model.listSubjectsWithProperty(RDF.type, ResourceFactory.createResource("http://www.w3.org/ns/dcat#Distribution")); + final ResIterator it = model.listSubjectsWithProperty(RDF.type, DCAT.Distribution); while (it.hasNext()) { - Resource distribution = it.next(); + final Resource distribution = it.next(); if (!usedDistributionIds.contains(distribution.getURI())) { model.remove(distribution.listProperties()); } @@ -166,13 +187,11 @@ public class CatalogFilter { } Collection<String> getDistributionsForDataset(Resource dataset) { - Set<String> result = new HashSet<>(); - StmtIterator it = dataset.listProperties(ResourceFactory.createProperty("http://www.w3.org/ns/dcat#distribution")); + final Set<String> result = new HashSet<>(); + final StmtIterator it = dataset.listProperties(DCAT.distribution); while (it.hasNext()) { - Statement next = it.next(); - - Resource distribution = next.getObject().asResource(); - + final Statement next = it.next(); + final Resource distribution = next.getObject().asResource(); result.add(distribution.getURI()); } @@ -180,14 +199,14 @@ public class CatalogFilter { } boolean hasAtLeastOneValidDistribution(Resource dataset) { - StmtIterator it = dataset.listProperties(ResourceFactory.createProperty("http://www.w3.org/ns/dcat#distribution")); + final StmtIterator it = dataset.listProperties(ResourceFactory.createProperty("http://www.w3.org/ns/dcat#distribution")); boolean atLeastOneValidFormat = false; while (it.hasNext()) { - Statement next = 
it.next(); + final Statement next = it.next(); - Resource distribution = next.getObject().asResource(); - RDFNode format = distribution.getProperty(DCTerms.format).getObject(); + final Resource distribution = next.getObject().asResource(); + final RDFNode format = distribution.getProperty(DCTerms.format).getObject(); if (!UNWANTED_FORMATS.contains(format)) { atLeastOneValidFormat = true; } diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml new file mode 100644 index 0000000000000000000000000000000000000000..4a352704107b66839cb2cd01d8e5ef499950fb80 --- /dev/null +++ b/src/main/resources/logback.xml @@ -0,0 +1,14 @@ +<configuration> + <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> + <encoder> + <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> + </encoder> + </appender> + + <root level="WARN"> + <appender-ref ref="STDOUT" /> + </root> + + <logger name="de.landsh.opendata" level="DEBUG"/> + +</configuration> \ No newline at end of file diff --git a/src/test/java/de/landsh/opendata/catalogproxy/CatalogFilterTest.java b/src/test/java/de/landsh/opendata/catalogproxy/CatalogFilterTest.java index c8b1f8598d11eb952a470e94556e1f65a8605578..9b570bda331eccb069bc7e92efc6eae3d039c537 100644 --- a/src/test/java/de/landsh/opendata/catalogproxy/CatalogFilterTest.java +++ b/src/test/java/de/landsh/opendata/catalogproxy/CatalogFilterTest.java @@ -1,9 +1,13 @@ package de.landsh.opendata.catalogproxy; -import org.apache.jena.rdf.model.*; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.ResIterator; +import org.apache.jena.rdf.model.Resource; import org.apache.jena.riot.RDFLanguages; import org.apache.jena.riot.RDFParser; import org.apache.jena.riot.system.ErrorHandlerFactory; +import org.apache.jena.vocabulary.DCAT; import org.apache.jena.vocabulary.RDF; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; @@ 
-13,6 +17,8 @@ import java.io.InputStream; import java.io.StringWriter; import java.util.Collections; +import static org.junit.jupiter.api.Assertions.*; + public class CatalogFilterTest { private final CatalogFilter catalogFilter = new CatalogFilter(); @@ -55,23 +61,62 @@ public class CatalogFilterTest { @Test public void removeUnusedResources_removeAll() { - Model model = parseRdf(getClass().getResourceAsStream("/catalog.xml")); + final Model model = parseRdf(getClass().getResourceAsStream("/catalog.xml")); catalogFilter.removeUnusedDistributions(model, Collections.emptySet()); - Assertions.assertEquals(0, countInstances(model, ResourceFactory.createResource("http://www.w3.org/ns/dcat#Distribution"))); + Assertions.assertEquals(0, countInstances(model, DCAT.Distribution)); } @Test public void rewriteHydraURLs() { - Model model = parseRdf(getClass().getResourceAsStream("/hydra.xml")); + final Model model = parseRdf(getClass().getResourceAsStream("/hydra.xml")); catalogFilter.rewriteHydraURLs(model); StringWriter sw = new StringWriter(); model.write(sw); final String result = sw.toString(); - Assertions.assertFalse(result.contains("http://opendata.schleswig-holstein.de/catalog.xml")); - Assertions.assertTrue(result.contains("https://example.org/catalog.xml?page=84")); + assertFalse(result.contains("http://opendata.schleswig-holstein.de/catalog.xml")); + assertTrue(result.contains("https://example.org/catalog.xml?page=84")); + } + + @Test + public void addDownloadURLs_will_add_accessURLs() { + final Model model = parseRdf(getClass().getResourceAsStream("/catalog.xml")); + + catalogFilter.addDownloadURLs(model); + + // Every distribution has a downloadURL with the same value as the accessURL. 
+ final ResIterator it = model.listSubjectsWithProperty(RDF.type, DCAT.Distribution); + int count = 0; + while (it.hasNext()) { + final Resource distribution = it.next(); + count++; + final Resource accessURL = distribution.getPropertyResourceValue(DCAT.accessURL); + final Resource downloadURL = distribution.getPropertyResourceValue(DCAT.downloadURL); + assertNotNull(downloadURL); + assertEquals(accessURL, downloadURL); + } + + assertEquals(101, count); + } + + @Test + public void addDownloadURLs_will_not_change_the_downloadURL_if_one_is_already_present() { + final Model model = parseRdf(getClass().getResourceAsStream("/with_downloadURL.xml")); + + catalogFilter.addDownloadURLs(model); + + final ResIterator it = model.listSubjectsWithProperty(RDF.type, DCAT.Distribution); + final Resource distribution = it.next(); + + assertEquals(1, distribution.listProperties(DCAT.downloadURL).toList().size()); + + final Resource accessURL = distribution.getPropertyResourceValue(DCAT.accessURL); + final Resource downloadURL = distribution.getPropertyResourceValue(DCAT.downloadURL); + assertNotNull(accessURL); + assertNotNull(downloadURL); + assertEquals("http://example.org/file.csv", downloadURL.getURI()); } } diff --git a/src/test/resources/with_downloadURL.xml b/src/test/resources/with_downloadURL.xml new file mode 100644 index 0000000000000000000000000000000000000000..655952beb3f55468829d44a0da344b24190df090 --- /dev/null +++ b/src/test/resources/with_downloadURL.xml @@ -0,0 +1,60 @@ +<?xml version="1.0" encoding="utf-8"?> +<rdf:RDF + xmlns:foaf="http://xmlns.com/foaf/0.1/" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:dcat="http://www.w3.org/ns/dcat#" + xmlns:dct="http://purl.org/dc/terms/" + xmlns:schema="http://schema.org/" + xmlns:vcard="http://www.w3.org/2006/vcard/ns#" + xmlns:dcatde="http://dcat-ap.de/def/dcatde/1.0.1/" +> + <dcat:Catalog rdf:about="http://opendata.schleswig-holstein.de"> + <dcat:dataset> + <dcat:Dataset 
rdf:about="http://opendata.schleswig-holstein.de/dataset/StaNord_CMS:50330"> + <dcat:keyword>Ernteberichterstattung über Feldfrüchte und Grünland in Schleswig-Holstein</dcat:keyword> + <dcatde:contributorID rdf:resource="http://dcat-ap.de/def/contributors/schleswigHolstein"/> + <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2019-06-01T13:46:55.291496</dct:issued> + <dct:publisher rdf:resource="http://opendata.schleswig-holstein.de/organization/statistikamt-nord"/> + <dct:license rdf:resource="http://dcat-ap.de/def/licenses/dl-by-de/2.0"/> + <dct:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2019-06-01T13:46:55.338566</dct:modified> + <dct:description>Ernteberichterstattung über Feldfrüchte und Grünland in Schleswig-Holstein - Endgültige Erträge und vorläufige Erntemengen von Getreide und Ölfrüchten, vorläufige Kartoffelernte 2004</dct:description> + <dct:title>Ernteberichterstattung über Feldfrüchte und Grünland in Schleswig-Holstein - Endgültige Erträge und vorläufige Erntemengen von Getreide und Ölfrüchten, vorläufige Kartoffelernte 2004</dct:title> + <dct:creator> + <foaf:Organization rdf:nodeID="Nc233acd517104832885bdfe3c81f3a01"> + <foaf:name>Statistisches Amt für Hamburg und Schleswig-Holstein</foaf:name> + <foaf:mbox>info@statistik-nord.de</foaf:mbox> + </foaf:Organization> + </dct:creator> + <dct:spatial rdf:resource="http://dcat-ap.de/def/politicalGeocoding/stateKey/01"/> + <dcat:keyword>Ernte: Feldfrüchte und Grünland</dcat:keyword> + <dct:temporal> + <dct:PeriodOfTime rdf:nodeID="N5cc5efcb2099487a95f039c45a53e3d8"> + <schema:startDate rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2004-09-01T00:00:00</schema:startDate> + <schema:endDate rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2004-09-30T00:00:00</schema:endDate> + </dct:PeriodOfTime> + </dct:temporal> + <dcatde:licenseAttributionByText>Statistisches Amt für Hamburg und Schleswig-Holstein - Anstalt des öffentlichen Rechts - (Statistikamt 
Nord)</dcatde:licenseAttributionByText> + <dcat:theme rdf:resource="http://publications.europa.eu/resource/authority/data-theme/AGRI"/> + <dcat:contactPoint> + <vcard:Organization rdf:nodeID="N5c233ba34b6f4a2dbee00193aafa12f0"> + <vcard:fn>Statistisches Amt für Hamburg und Schleswig-Holstein</vcard:fn> + <vcard:hasEmail rdf:resource="mailto:info@statistik-nord.de"/> + </vcard:Organization> + </dcat:contactPoint> + <dct:identifier>StaNord_CMS:50330</dct:identifier> + <dcat:landingPage rdf:resource="http://www.statistik-nord.de"/> + <dcat:distribution> + <dcat:Distribution rdf:about="http://opendata.schleswig-holstein.de/dataset/StaNord_CMS:50330/resource/7b80638b-b65f-44fe-ab7a-95855db890b6"> + <dct:title>Ernteberichterstattung über Feldfrüchte und Grünland in Schleswig-Holstein - Endgültige Erträge und vorläufige Erntemengen von Getreide und Ölfrüchten, vorläufige Kartoffelernte 2004</dct:title> + <dcatde:licenseAttributionByText>Statistisches Amt für Hamburg und Schleswig-Holstein - Anstalt des öffentlichen Rechts - (Statistikamt Nord)</dcatde:licenseAttributionByText> + <dcat:accessURL rdf:resource="https://www.statistik-nord.de/fileadmin/Dokumente/Statistische_Berichte/landwirtschaft/C_II_1_m_S/C_II_1_m0409_S.pdf"/> + <dcat:downloadURL rdf:resource="http://example.org/file.csv"/> + <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#decimal">112874985.0</dcat:byteSize> + <dct:license rdf:resource="http://dcat-ap.de/def/licenses/dl-by-de/2.0"/> + <dct:format rdf:resource="http://publications.europa.eu/resource/authority/file-type/PDF"/> + </dcat:Distribution> + </dcat:distribution> + </dcat:Dataset> + </dcat:dataset> + </dcat:Catalog> + </rdf:RDF>