Skip to content
Snippets Groups Projects
Commit 7669ba32 authored by Jesper Zedlitz's avatar Jesper Zedlitz
Browse files

Copy accessURL to downloadURL

Da in Schleswig-Holstein jede Distribution herunterladbar ist, kann die accessURL auch als downloadURL übernommen werden.
parent e0fd0c23
No related branches found
No related tags found
No related merge requests found
...@@ -6,6 +6,7 @@ import org.apache.jena.riot.RDFLanguages; ...@@ -6,6 +6,7 @@ import org.apache.jena.riot.RDFLanguages;
import org.apache.jena.riot.RDFParser; import org.apache.jena.riot.RDFParser;
import org.apache.jena.riot.system.ErrorHandlerFactory; import org.apache.jena.riot.system.ErrorHandlerFactory;
import org.apache.jena.util.ResourceUtils; import org.apache.jena.util.ResourceUtils;
import org.apache.jena.vocabulary.DCAT;
import org.apache.jena.vocabulary.DCTerms; import org.apache.jena.vocabulary.DCTerms;
import org.apache.jena.vocabulary.RDF; import org.apache.jena.vocabulary.RDF;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
...@@ -35,7 +36,7 @@ public class CatalogFilter { ...@@ -35,7 +36,7 @@ public class CatalogFilter {
} }
Model work(InputStream in) { Model work(InputStream in) {
Model model = ModelFactory.createDefaultModel(); final Model model = ModelFactory.createDefaultModel();
RDFParser.create() RDFParser.create()
.source(in) .source(in)
...@@ -44,16 +45,16 @@ public class CatalogFilter { ...@@ -44,16 +45,16 @@ public class CatalogFilter {
.base(baseURL) .base(baseURL)
.parse(model); .parse(model);
Set<String> usedDistributionIds = new HashSet<>(); final Set<String> usedDistributionIds = new HashSet<>();
final ResIterator it = model.listSubjectsWithProperty(RDF.type, ResourceFactory.createResource("http://www.w3.org/ns/dcat#Dataset")); final ResIterator it = model.listSubjectsWithProperty(RDF.type, DCAT.Dataset);
while (it.hasNext()) { while (it.hasNext()) {
Resource dataset = it.next(); Resource dataset = it.next();
if (hasAtLeastOneValidDistribution(dataset)) { if (hasAtLeastOneValidDistribution(dataset)) {
usedDistributionIds.addAll(getDistributionsForDataset(dataset)); usedDistributionIds.addAll(getDistributionsForDataset(dataset));
} else { } else {
model.remove(dataset.listProperties()); model.remove(dataset.listProperties());
model.remove(model.listStatements(null, ResourceFactory.createProperty("http://www.w3.org/ns/dcat#dataset"), dataset)); model.remove(model.listStatements(null, DCAT.dataset, dataset));
} }
} }
...@@ -62,19 +63,40 @@ public class CatalogFilter { ...@@ -62,19 +63,40 @@ public class CatalogFilter {
removeUnusedLocations(model); removeUnusedLocations(model);
minimizeLocations(model); minimizeLocations(model);
rewriteHydraURLs(model); rewriteHydraURLs(model);
addDownloadURLs(model);
return model; return model;
} }
/**
 * Adds a {@code dcat:downloadURL} to every {@code dcat:Distribution} that does not have one yet,
 * reusing the distribution's {@code dcat:accessURL} as the value. An existing downloadURL is never
 * changed or duplicated.
 *
 * <p>The German DCAT-AP.de treats downloadURL as a not-so-important optional property and relies on
 * the accessURL. However, the European data portal values the downloadURL property highly.
 *
 * <p>Distributions that have no resource-valued accessURL are skipped, because there is nothing
 * that could be copied.
 *
 * @param model the model whose distributions are completed in place
 */
void addDownloadURLs(Model model) {
    final ResIterator it = model.listSubjectsWithProperty(RDF.type, DCAT.Distribution);
    while (it.hasNext()) {
        final Resource distribution = it.next();
        // hasProperty also detects a literal-valued downloadURL, which
        // getPropertyResourceValue would report as null.
        if (distribution.hasProperty(DCAT.downloadURL)) {
            continue;
        }
        final Resource accessURL = distribution.getPropertyResourceValue(DCAT.accessURL);
        // Without this guard a missing accessURL would be passed to addProperty as null.
        if (accessURL != null) {
            distribution.addProperty(DCAT.downloadURL, accessURL);
        }
    }
}
void rewriteHydraURLs(Model model) { void rewriteHydraURLs(Model model) {
final ResIterator it = model.listSubjectsWithProperty(RDF.type, ResourceFactory.createResource("http://www.w3.org/ns/hydra/core#PagedCollection")); final ResIterator it = model.listSubjectsWithProperty(RDF.type, ResourceFactory.createResource("http://www.w3.org/ns/hydra/core#PagedCollection"));
if (it.hasNext()) { if (it.hasNext()) {
final Resource pagedCollection = it.nextResource(); final Resource pagedCollection = it.nextResource();
final String originalURL = StringUtils.substringBefore(pagedCollection.getURI(), "catalog.xml"); final String originalURL = StringUtils.substringBefore(pagedCollection.getURI(), "catalog.xml");
List<Statement> changeStatements = new ArrayList<>(); final List<Statement> changeStatements = new ArrayList<>();
StmtIterator iterator = pagedCollection.listProperties(); final StmtIterator iterator = pagedCollection.listProperties();
while (iterator.hasNext()) { while (iterator.hasNext()) {
Statement stmt = iterator.next(); Statement stmt = iterator.next();
if (stmt.getObject().isLiteral()) { if (stmt.getObject().isLiteral()) {
...@@ -92,7 +114,6 @@ public class CatalogFilter { ...@@ -92,7 +114,6 @@ public class CatalogFilter {
} }
ResourceUtils.renameResource(pagedCollection, pagedCollection.getURI().replaceFirst(originalURL, baseURL)); ResourceUtils.renameResource(pagedCollection, pagedCollection.getURI().replaceFirst(originalURL, baseURL));
} }
} }
...@@ -130,7 +151,7 @@ public class CatalogFilter { ...@@ -130,7 +151,7 @@ public class CatalogFilter {
void removeAnonymousResources(Model model) { void removeAnonymousResources(Model model) {
final ResIterator it = model.listSubjects(); final ResIterator it = model.listSubjects();
Collection<Resource> allObjects = allObjects(model); final Collection<Resource> allObjects = allObjects(model);
while (it.hasNext()) { while (it.hasNext()) {
Resource resource = it.next(); Resource resource = it.next();
...@@ -141,8 +162,8 @@ public class CatalogFilter { ...@@ -141,8 +162,8 @@ public class CatalogFilter {
} }
Collection<Resource> allObjects(Model model) { Collection<Resource> allObjects(Model model) {
Set<Resource> result = new HashSet<>(); final Set<Resource> result = new HashSet<>();
NodeIterator it = model.listObjects(); final NodeIterator it = model.listObjects();
while (it.hasNext()) { while (it.hasNext()) {
RDFNode next = it.next(); RDFNode next = it.next();
if (next.isResource()) { if (next.isResource()) {
...@@ -156,9 +177,9 @@ public class CatalogFilter { ...@@ -156,9 +177,9 @@ public class CatalogFilter {
* Entfernt aus dem Model alle dcat:Distribution Instanzen, deren URI nicht in der angegebenen Collection enthalten sind. * Entfernt aus dem Model alle dcat:Distribution Instanzen, deren URI nicht in der angegebenen Collection enthalten sind.
*/ */
void removeUnusedDistributions(Model model, Collection<String> usedDistributionIds) { void removeUnusedDistributions(Model model, Collection<String> usedDistributionIds) {
final ResIterator it = model.listSubjectsWithProperty(RDF.type, ResourceFactory.createResource("http://www.w3.org/ns/dcat#Distribution")); final ResIterator it = model.listSubjectsWithProperty(RDF.type, DCAT.Distribution);
while (it.hasNext()) { while (it.hasNext()) {
Resource distribution = it.next(); final Resource distribution = it.next();
if (!usedDistributionIds.contains(distribution.getURI())) { if (!usedDistributionIds.contains(distribution.getURI())) {
model.remove(distribution.listProperties()); model.remove(distribution.listProperties());
} }
...@@ -166,13 +187,11 @@ public class CatalogFilter { ...@@ -166,13 +187,11 @@ public class CatalogFilter {
} }
Collection<String> getDistributionsForDataset(Resource dataset) { Collection<String> getDistributionsForDataset(Resource dataset) {
Set<String> result = new HashSet<>(); final Set<String> result = new HashSet<>();
StmtIterator it = dataset.listProperties(ResourceFactory.createProperty("http://www.w3.org/ns/dcat#distribution")); final StmtIterator it = dataset.listProperties(DCAT.distribution);
while (it.hasNext()) { while (it.hasNext()) {
Statement next = it.next(); final Statement next = it.next();
final Resource distribution = next.getObject().asResource();
Resource distribution = next.getObject().asResource();
result.add(distribution.getURI()); result.add(distribution.getURI());
} }
...@@ -180,14 +199,14 @@ public class CatalogFilter { ...@@ -180,14 +199,14 @@ public class CatalogFilter {
} }
boolean hasAtLeastOneValidDistribution(Resource dataset) { boolean hasAtLeastOneValidDistribution(Resource dataset) {
StmtIterator it = dataset.listProperties(ResourceFactory.createProperty("http://www.w3.org/ns/dcat#distribution")); final StmtIterator it = dataset.listProperties(ResourceFactory.createProperty("http://www.w3.org/ns/dcat#distribution"));
boolean atLeastOneValidFormat = false; boolean atLeastOneValidFormat = false;
while (it.hasNext()) { while (it.hasNext()) {
Statement next = it.next(); final Statement next = it.next();
Resource distribution = next.getObject().asResource(); final Resource distribution = next.getObject().asResource();
RDFNode format = distribution.getProperty(DCTerms.format).getObject(); final RDFNode format = distribution.getProperty(DCTerms.format).getObject();
if (!UNWANTED_FORMATS.contains(format)) { if (!UNWANTED_FORMATS.contains(format)) {
atLeastOneValidFormat = true; atLeastOneValidFormat = true;
} }
......
<configuration>
<!-- Console appender: each line shows time, thread, level, logger name (shortened to about 36 characters) and the message. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- Default level for all loggers: only warnings and errors reach the console. -->
<root level="WARN">
<appender-ref ref="STDOUT" />
</root>
<!-- Loggers below de.landsh.opendata are more verbose and also emit DEBUG messages. -->
<logger name="de.landsh.opendata" level="DEBUG"/>
</configuration>
\ No newline at end of file
package de.landsh.opendata.catalogproxy; package de.landsh.opendata.catalogproxy;
import org.apache.jena.rdf.model.*; import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.ResIterator;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.riot.RDFLanguages; import org.apache.jena.riot.RDFLanguages;
import org.apache.jena.riot.RDFParser; import org.apache.jena.riot.RDFParser;
import org.apache.jena.riot.system.ErrorHandlerFactory; import org.apache.jena.riot.system.ErrorHandlerFactory;
import org.apache.jena.vocabulary.DCAT;
import org.apache.jena.vocabulary.RDF; import org.apache.jena.vocabulary.RDF;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
...@@ -13,6 +17,8 @@ import java.io.InputStream; ...@@ -13,6 +17,8 @@ import java.io.InputStream;
import java.io.StringWriter; import java.io.StringWriter;
import java.util.Collections; import java.util.Collections;
import static org.junit.jupiter.api.Assertions.*;
public class CatalogFilterTest { public class CatalogFilterTest {
private final CatalogFilter catalogFilter = new CatalogFilter(); private final CatalogFilter catalogFilter = new CatalogFilter();
...@@ -55,23 +61,62 @@ public class CatalogFilterTest { ...@@ -55,23 +61,62 @@ public class CatalogFilterTest {
@Test @Test
public void removeUnusedResources_removeAll() { public void removeUnusedResources_removeAll() {
Model model = parseRdf(getClass().getResourceAsStream("/catalog.xml")); final Model model = parseRdf(getClass().getResourceAsStream("/catalog.xml"));
catalogFilter.removeUnusedDistributions(model, Collections.emptySet()); catalogFilter.removeUnusedDistributions(model, Collections.emptySet());
Assertions.assertEquals(0, countInstances(model, ResourceFactory.createResource("http://www.w3.org/ns/dcat#Distribution"))); Assertions.assertEquals(0, countInstances(model, DCAT.Distribution));
} }
@Test @Test
public void rewriteHydraURLs() { public void rewriteHydraURLs() {
Model model = parseRdf(getClass().getResourceAsStream("/hydra.xml")); final Model model = parseRdf(getClass().getResourceAsStream("/hydra.xml"));
catalogFilter.rewriteHydraURLs(model); catalogFilter.rewriteHydraURLs(model);
StringWriter sw = new StringWriter(); StringWriter sw = new StringWriter();
model.write(sw); model.write(sw);
final String result = sw.toString(); final String result = sw.toString();
Assertions.assertFalse(result.contains("http://opendata.schleswig-holstein.de/catalog.xml")); assertFalse(result.contains("http://opendata.schleswig-holstein.de/catalog.xml"));
Assertions.assertTrue(result.contains("https://example.org/catalog.xml?page=84")); assertTrue(result.contains("https://example.org/catalog.xml?page=84"));
}
/**
 * After {@code addDownloadURLs} has run on {@code /catalog.xml}, each distribution must carry a
 * downloadURL that equals its accessURL. The fixture is expected to contain 101 distributions.
 */
@Test
public void addDownloadURLs_will_add_accessURLs() {
    final Model catalog = parseRdf(getClass().getResourceAsStream("/catalog.xml"));

    catalogFilter.addDownloadURLs(catalog);

    int seen = 0;
    for (final ResIterator distributions = catalog.listSubjectsWithProperty(RDF.type, DCAT.Distribution);
         distributions.hasNext(); seen++) {
        final Resource current = distributions.next();
        final Resource expected = current.getPropertyResourceValue(DCAT.accessURL);
        final Resource actual = current.getPropertyResourceValue(DCAT.downloadURL);
        // The downloadURL must exist and mirror the accessURL.
        assertNotNull(actual);
        assertEquals(expected, actual);
    }

    // Guards against the loop silently running over an empty or truncated fixture.
    assertEquals(101, seen);
}
/**
 * Checks that addDownloadURLs leaves an already present downloadURL alone: after the call, the
 * first distribution found in /with_downloadURL.xml still has exactly one downloadURL and its URI
 * is http://example.org/file.csv.
 */
@Test
public void addDownloadURLs_will_not_change_the_downloadURL_if_one_is_already_present() {
final Model model = parseRdf(getClass().getResourceAsStream("/with_downloadURL.xml"));
catalogFilter.addDownloadURLs(model);
// Only the first distribution returned by the iterator is inspected.
final ResIterator it = model.listSubjectsWithProperty(RDF.type, DCAT.Distribution);
final Resource distribution = it.next();
// No additional downloadURL statement may have been added.
assertEquals(1, distribution.listProperties(DCAT.downloadURL).toList().size());
final Resource accessURL = distribution.getPropertyResourceValue(DCAT.accessURL);
final Resource downloadURL = distribution.getPropertyResourceValue(DCAT.downloadURL);
assertNotNull(accessURL);
assertNotNull(downloadURL);
// The original downloadURL value must have survived unchanged.
assertEquals("http://example.org/file.csv", downloadURL.getURI());
} }
} }
<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
xmlns:foaf="http://xmlns.com/foaf/0.1/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dcat="http://www.w3.org/ns/dcat#"
xmlns:dct="http://purl.org/dc/terms/"
xmlns:schema="http://schema.org/"
xmlns:vcard="http://www.w3.org/2006/vcard/ns#"
xmlns:dcatde="http://dcat-ap.de/def/dcatde/1.0.1/"
>
<dcat:Catalog rdf:about="http://opendata.schleswig-holstein.de">
<dcat:dataset>
<dcat:Dataset rdf:about="http://opendata.schleswig-holstein.de/dataset/StaNord_CMS:50330">
<dcat:keyword>Ernteberichterstattung über Feldfrüchte und Grünland in Schleswig-Holstein</dcat:keyword>
<dcatde:contributorID rdf:resource="http://dcat-ap.de/def/contributors/schleswigHolstein"/>
<dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2019-06-01T13:46:55.291496</dct:issued>
<dct:publisher rdf:resource="http://opendata.schleswig-holstein.de/organization/statistikamt-nord"/>
<dct:license rdf:resource="http://dcat-ap.de/def/licenses/dl-by-de/2.0"/>
<dct:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2019-06-01T13:46:55.338566</dct:modified>
<dct:description>Ernteberichterstattung über Feldfrüchte und Grünland in Schleswig-Holstein - Endgültige Erträge und vorläufige Erntemengen von Getreide und Ölfrüchten, vorläufige Kartoffelernte 2004</dct:description>
<dct:title>Ernteberichterstattung über Feldfrüchte und Grünland in Schleswig-Holstein - Endgültige Erträge und vorläufige Erntemengen von Getreide und Ölfrüchten, vorläufige Kartoffelernte 2004</dct:title>
<dct:creator>
<foaf:Organization rdf:nodeID="Nc233acd517104832885bdfe3c81f3a01">
<foaf:name>Statistisches Amt für Hamburg und Schleswig-Holstein</foaf:name>
<foaf:mbox>info@statistik-nord.de</foaf:mbox>
</foaf:Organization>
</dct:creator>
<dct:spatial rdf:resource="http://dcat-ap.de/def/politicalGeocoding/stateKey/01"/>
<dcat:keyword>Ernte: Feldfrüchte und Grünland</dcat:keyword>
<dct:temporal>
<dct:PeriodOfTime rdf:nodeID="N5cc5efcb2099487a95f039c45a53e3d8">
<schema:startDate rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2004-09-01T00:00:00</schema:startDate>
<schema:endDate rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2004-09-30T00:00:00</schema:endDate>
</dct:PeriodOfTime>
</dct:temporal>
<dcatde:licenseAttributionByText>Statistisches Amt für Hamburg und Schleswig-Holstein - Anstalt des öffentlichen Rechts - (Statistikamt Nord)</dcatde:licenseAttributionByText>
<dcat:theme rdf:resource="http://publications.europa.eu/resource/authority/data-theme/AGRI"/>
<dcat:contactPoint>
<vcard:Organization rdf:nodeID="N5c233ba34b6f4a2dbee00193aafa12f0">
<vcard:fn>Statistisches Amt für Hamburg und Schleswig-Holstein</vcard:fn>
<vcard:hasEmail rdf:resource="mailto:info@statistik-nord.de"/>
</vcard:Organization>
</dcat:contactPoint>
<dct:identifier>StaNord_CMS:50330</dct:identifier>
<dcat:landingPage rdf:resource="http://www.statistik-nord.de"/>
<dcat:distribution>
<dcat:Distribution rdf:about="http://opendata.schleswig-holstein.de/dataset/StaNord_CMS:50330/resource/7b80638b-b65f-44fe-ab7a-95855db890b6">
<dct:title>Ernteberichterstattung über Feldfrüchte und Grünland in Schleswig-Holstein - Endgültige Erträge und vorläufige Erntemengen von Getreide und Ölfrüchten, vorläufige Kartoffelernte 2004</dct:title>
<dcatde:licenseAttributionByText>Statistisches Amt für Hamburg und Schleswig-Holstein - Anstalt des öffentlichen Rechts - (Statistikamt Nord)</dcatde:licenseAttributionByText>
<dcat:accessURL rdf:resource="https://www.statistik-nord.de/fileadmin/Dokumente/Statistische_Berichte/landwirtschaft/C_II_1_m_S/C_II_1_m0409_S.pdf"/>
<dcat:downloadURL rdf:resource="http://example.org/file.csv"/>
<dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#decimal">112874985.0</dcat:byteSize>
<dct:license rdf:resource="http://dcat-ap.de/def/licenses/dl-by-de/2.0"/>
<dct:format rdf:resource="http://publications.europa.eu/resource/authority/file-type/PDF"/>
</dcat:Distribution>
</dcat:distribution>
</dcat:Dataset>
</dcat:dataset>
</dcat:Catalog>
</rdf:RDF>
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment