From ec1fc35e8adee240d11af993a6d1deef733ab552 Mon Sep 17 00:00:00 2001
From: anonymous <anonymous>
Date: Thu, 25 Apr 2019 15:22:46 +0200
Subject: [PATCH] ODPSH-381: convert file formats from short to long and vice-versa (DCAT)

---
 ckanext/odsh/profiles.py | 156 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)

diff --git a/ckanext/odsh/profiles.py b/ckanext/odsh/profiles.py
index a3b2933a..0c57a384 100644
--- a/ckanext/odsh/profiles.py
+++ b/ckanext/odsh/profiles.py
@@ -2,10 +2,24 @@ from ckanext.dcatde.profiles import DCATdeProfile, DCATDE, DCAT, VCARD, dcat_the
 from ckanext.dcat.utils import resource_uri
 from ckanext.dcat.profiles import EuropeanDCATAPProfile, DCT
 from ckan.model.license import LicenseRegister
+import rdflib
 import ckanext.dcatde.dataset_utils as ds_utils
 import logging
+from ckan.plugins import toolkit
+from ckan.common import config
+
+import sys
+if sys.version_info[0] == 2:
+    import urllib2
+    _text_type = unicode  # noqa: F821 - builtin on Python 2 only
+elif sys.version_info[0] == 3:  # >=Python3.1
+    # A bare "import urllib" does not load the urllib.request submodule.
+    import urllib.request
+    _text_type = str
 
 log = logging.getLogger(__name__)
+DCT = rdflib.namespace.Namespace("http://purl.org/dc/terms/")
+DCAT = rdflib.namespace.Namespace("http://www.w3.org/ns/dcat#")
 
 
 class ODSHEuropeanDCATAPProfile(EuropeanDCATAPProfile):
@@ -34,6 +48,41 @@ class ODSHEuropeanDCATAPProfile(EuropeanDCATAPProfile):
                     return license_id
         return ''
 
+    def _distribution_format(self, distribution, normalize_ckan_format=True):
+        """Return ``(imt, label)`` with a file-type URI label shortened.
+
+        The parent profile determines both values; a label that is a known
+        file-type URI is then replaced by its short code.
+        """
+        imt, label = super(
+            ODSHEuropeanDCATAPProfile, self)._distribution_format(
+                distribution, normalize_ckan_format)
+        # Unknown labels are passed through unchanged.
+        label = resource_formats_import().get(label, label)
+        return imt, label
+
+    def graph_from_dataset(self, dataset_dict, dataset_ref):
+        """Build the dataset graph and export formats as file-type URIs.
+
+        Once the parent profile has filled the graph, each ``dct:format``
+        value of a ``dcat:Distribution`` that is a known short code is
+        replaced by the matching file-type URI.
+        """
+        super(ODSHEuropeanDCATAPProfile, self).graph_from_dataset(
+            dataset_dict, dataset_ref)
+        export_formats = resource_formats_export()
+        # Snapshot the matches first: the graph is modified inside the loop.
+        distributions = list(
+            self.g.subjects(rdflib.RDF.type, DCAT.Distribution))
+        for distribution in distributions:
+            for value in list(self.g.objects(distribution, DCT['format'])):
+                # Convert the rdflib term to plain text for the dict lookup
+                # (str has no .decode() on Python 3).
+                short_name = _text_type(value)
+                if short_name in export_formats:
+                    self.g.set((distribution, DCT['format'],
+                                rdflib.URIRef(export_formats[short_name])))
+
 class ODSHDCATdeProfile(DCATdeProfile):
     def parse_dataset(self, dataset_dict, dataset_ref):
         dataset_dict = super(ODSHDCATdeProfile,self).parse_dataset(dataset_dict, dataset_ref)
@@ -48,3 +97,110 @@ class ODSHDCATdeProfile(DCATdeProfile):
                             ds_utils.insert_new_extras_field(dataset_dict, 'licenseAttributionByText', value)
                             return dataset_dict
         return dataset_dict
+
+_RESOURCE_FORMATS_IMPORT = None
+_RESOURCE_FORMATS_EXPORT = None
+
+# File-type list used when ckan.odsh.resource_formats_url is not set.
+_FORMATS_DEFAULT_URL = (
+    "http://publications.europa.eu/resource/authority/file-type")
+# Local copy of the list, used when the download fails.
+_FORMATS_BACKUP_PATH = (
+    '/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/fileformats.rdf')
+# 143 file types were listed when this was written; a graph with no
+# more than this many subjects is treated as incomplete.
+_FORMATS_MIN_SUBJECTS = 120
+
+
+def _fetch_url(url):
+    """Download *url* and return the response body."""
+    if sys.version_info[0] == 2:
+        response = urllib2.urlopen(urllib2.Request(url))
+    else:
+        response = urllib.request.urlopen(urllib.request.Request(url))
+    try:
+        return response.read()
+    finally:
+        response.close()
+
+
+def _load_file_types(**parse_args):
+    """Parse an RDF source and return its subjects as plain text.
+
+    *parse_args* are passed to ``rdflib.Graph.parse``. Raises
+    ``ValueError`` if the graph has too few subjects to be the full list.
+    """
+    graph = rdflib.Graph()
+    graph.parse(**parse_args)
+    subjects = set(graph.subjects())
+    log.debug("filetype-count:" + str(len(subjects)))
+    if len(subjects) <= _FORMATS_MIN_SUBJECTS:
+        raise ValueError(
+            "only %d file types found" % len(subjects))
+    return set(_text_type(subject) for subject in subjects)
+
+
+def resource_formats():
+    """Load both file-format lookup tables into the module cache.
+
+    The list is downloaded from the ``ckan.odsh.resource_formats_url``
+    setting (falling back to the default URL) and, when valid, saved to
+    the backup file. If the download fails, the backup file is used.
+
+    Raises ``Exception`` if neither source yields a usable list.
+    """
+    global _RESOURCE_FORMATS_IMPORT
+    global _RESOURCE_FORMATS_EXPORT
+
+    # Resolved outside the try block so the error below can name it.
+    format_european_url = (
+        config.get('ckan.odsh.resource_formats_url')
+        or _FORMATS_DEFAULT_URL)
+    try:
+        content = _fetch_url(format_european_url)
+        file_types = _load_file_types(data=content, format='xml')
+    except Exception as error:
+        log.warning("Loading file formats from %s failed (%s); "
+                    "trying local backup", format_european_url, error)
+        try:
+            file_types = _load_file_types(source=_FORMATS_BACKUP_PATH)
+        except Exception:
+            raise Exception(
+                "Could not get file formats from " + format_european_url)
+    else:
+        # Refresh the backup from the validated download. A write
+        # failure is only logged: the list itself was loaded fine.
+        try:
+            with open(_FORMATS_BACKUP_PATH, 'wb') as backup_file:
+                backup_file.write(content)
+        except (IOError, OSError) as error:
+            log.warning("Could not write file format backup %s: %s",
+                        _FORMATS_BACKUP_PATH, error)
+
+    import_formats = {}
+    export_formats = {}
+    for uri in sorted(file_types):
+        short_name = uri.split('/')[-1]
+        # Skip the entry whose last segment is 'file-type' (presumably
+        # the list itself rather than a format).
+        if short_name != 'file-type':
+            export_formats[short_name] = uri
+            import_formats[uri] = short_name
+    # Publish only complete tables, so a failed load leaves no
+    # half-filled cache behind.
+    _RESOURCE_FORMATS_IMPORT = import_formats
+    _RESOURCE_FORMATS_EXPORT = export_formats
+
+
+def resource_formats_export():
+    """Return the short-name -> URI table, loading it on first use."""
+    if not _RESOURCE_FORMATS_EXPORT:
+        resource_formats()
+    return _RESOURCE_FORMATS_EXPORT
+
+
+def resource_formats_import():
+    """Return the URI -> short-name table, loading it on first use."""
+    if not _RESOURCE_FORMATS_IMPORT:
+        resource_formats()
+    return _RESOURCE_FORMATS_IMPORT
-- 
GitLab