Skip to content
Snippets Groups Projects
Commit ec1fc35e authored by anonymous's avatar anonymous
Browse files

ODPSH-381: convert file formats from short to long and vice-versa (DCAT)

parent b1067161
No related branches found
No related tags found
No related merge requests found
...@@ -2,10 +2,21 @@ from ckanext.dcatde.profiles import DCATdeProfile, DCATDE, DCAT, VCARD, dcat_the ...@@ -2,10 +2,21 @@ from ckanext.dcatde.profiles import DCATdeProfile, DCATDE, DCAT, VCARD, dcat_the
from ckanext.dcat.utils import resource_uri from ckanext.dcat.utils import resource_uri
from ckanext.dcat.profiles import EuropeanDCATAPProfile, DCT from ckanext.dcat.profiles import EuropeanDCATAPProfile, DCT
from ckan.model.license import LicenseRegister from ckan.model.license import LicenseRegister
import rdflib
import ckanext.dcatde.dataset_utils as ds_utils import ckanext.dcatde.dataset_utils as ds_utils
import logging import logging
from ckan.plugins import toolkit
from ckan.common import config
import sys
if sys.version_info[0] == 2:
import urllib2
elif sys.version_info[0] == 3: # >=Python3.1
import urllib
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# RDF namespaces used below when rewriting dct:format values on distributions.
# NOTE(review): these assignments rebind the DCT and DCAT names that are also
# imported above (from ckanext.dcat.profiles and ckanext.dcatde.profiles) --
# confirm the shadowing is intentional.
DCT = rdflib.namespace.Namespace("http://purl.org/dc/terms/")
DCAT = rdflib.namespace.Namespace("http://www.w3.org/ns/dcat#")
class ODSHEuropeanDCATAPProfile(EuropeanDCATAPProfile): class ODSHEuropeanDCATAPProfile(EuropeanDCATAPProfile):
...@@ -34,6 +45,19 @@ class ODSHEuropeanDCATAPProfile(EuropeanDCATAPProfile): ...@@ -34,6 +45,19 @@ class ODSHEuropeanDCATAPProfile(EuropeanDCATAPProfile):
return license_id return license_id
return '' return ''
def _distribution_format(self, distribution, normalize_ckan_format=True):
    """Return ``(imt, label)`` for a distribution, shortening known labels.

    The pair is computed by the parent profile. If the label is a key of the
    import table returned by ``resource_formats_import()`` (full file-type
    URI -> short name), the short name is returned in its place; otherwise
    the label is passed through unchanged.
    """
    parent_profile = super(ODSHEuropeanDCATAPProfile, self)
    imt, label = parent_profile._distribution_format(distribution, normalize_ckan_format)
    uri_to_short_name = resource_formats_import()
    return imt, uri_to_short_name.get(label, label)
def graph_from_dataset(self, dataset_dict, dataset_ref):
    """Build the dataset graph, then export distribution formats as URIs.

    After the parent profile has filled ``self.g``, every ``dct:format``
    value of every ``dcat:Distribution`` is looked up in the export table
    returned by ``resource_formats_export()`` (short name -> full file-type
    URI). On a match the distribution's ``dct:format`` is replaced by a
    ``URIRef`` of that URI; values without a match are left alone.

    :param dataset_dict: CKAN dataset dict, passed through to the parent.
    :param dataset_ref: RDF node of the dataset, passed through to the parent.
    """
    super(ODSHEuropeanDCATAPProfile, self).graph_from_dataset(dataset_dict, dataset_ref)

    # ``unicode`` on Python 2, ``str`` on Python 3. rdflib terms are text
    # subclasses that do not compare equal to plain strings, so they must be
    # converted before being used as dict keys. The previous ``.decode()``
    # call does not exist on Python 3 text and breaks on non-ASCII on Python 2.
    text_type = type(u'')

    # Item access instead of ``DCT.format``: the attribute form would resolve
    # to the string method ``format`` rather than to the namespace term.
    format_predicate = DCT['format']

    # Snapshot the matches first: Graph.set() below removes and adds triples,
    # and the graph must not change while a triples() generator over it is
    # still being consumed.
    distributions = [
        subject for subject, _, _
        in self.g.triples((None, rdflib.RDF.type, DCAT.Distribution))
    ]
    for distribution in distributions:
        format_values = [
            value for _, _, value
            in self.g.triples((distribution, format_predicate, None))
        ]
        for value in format_values:
            short_name = text_type(value)
            # Looked up lazily so that datasets without any format value
            # never trigger loading of the file-type vocabulary.
            short_name_to_uri = resource_formats_export()
            if short_name in short_name_to_uri:
                self.g.set((
                    distribution,
                    format_predicate,
                    rdflib.URIRef(short_name_to_uri[short_name]),
                ))
class ODSHDCATdeProfile(DCATdeProfile): class ODSHDCATdeProfile(DCATdeProfile):
def parse_dataset(self, dataset_dict, dataset_ref): def parse_dataset(self, dataset_dict, dataset_ref):
dataset_dict = super(ODSHDCATdeProfile,self).parse_dataset(dataset_dict, dataset_ref) dataset_dict = super(ODSHDCATdeProfile,self).parse_dataset(dataset_dict, dataset_ref)
...@@ -48,3 +72,62 @@ class ODSHDCATdeProfile(DCATdeProfile): ...@@ -48,3 +72,62 @@ class ODSHDCATdeProfile(DCATdeProfile):
ds_utils.insert_new_extras_field(dataset_dict, 'licenseAttributionByText', value) ds_utils.insert_new_extras_field(dataset_dict, 'licenseAttributionByText', value)
return dataset_dict return dataset_dict
return dataset_dict return dataset_dict
# Module-level lookup tables; both stay None until resource_formats() fills them.
# Import table: full file-type URI -> short name (last path segment of the URI).
_RESOURCE_FORMATS_IMPORT = None
# Export table: short name (last path segment of the URI) -> full file-type URI.
_RESOURCE_FORMATS_EXPORT = None
def resource_formats():
    """Load the file-type vocabulary and rebuild both format lookup tables.

    The vocabulary is downloaded from the URL configured as
    ``ckan.odsh.resource_formats_url`` (default: the EU Publications Office
    file-type authority list). A download is accepted only if it yields more
    than 120 distinct subjects; it is then also written to a local backup
    file. If the download fails or looks incomplete, the backup file is
    parsed instead and checked against the same threshold.

    On success the module-level tables are replaced:

    * ``_RESOURCE_FORMATS_EXPORT``: short name -> full URI
    * ``_RESOURCE_FORMATS_IMPORT``: full URI -> short name

    where the short name is the last ``/``-separated segment of the URI and
    the vocabulary's own ``.../file-type`` node is skipped. The tables are
    only replaced once fully built, so a failed reload leaves previously
    loaded tables in place.

    :raises Exception: if neither the online source nor the backup file
        provides a usable vocabulary.
    """
    global _RESOURCE_FORMATS_IMPORT
    global _RESOURCE_FORMATS_EXPORT

    # Function-scope imports: a bare ``import urllib`` does not load the
    # ``urllib.request`` submodule on Python 3.
    from contextlib import closing
    try:
        from urllib.request import Request, urlopen  # Python 3
    except ImportError:
        from urllib2 import Request, urlopen  # Python 2

    backup_path = '/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/fileformats.rdf'
    # At the moment, there are 143 different file types listed;
    # if 120 or fewer are found, something went wrong.
    min_subject_count = 120
    # ``unicode`` on Python 2, ``str`` on Python 3.
    text_type = type(u'')

    # Resolved outside any try block so the name is always bound when the
    # final error message is built.
    format_european_url = config.get('ckan.odsh.resource_formats_url')
    if not format_european_url:
        format_european_url = "http://publications.europa.eu/resource/authority/file-type"

    graph = None
    payload = None

    # First try to get the current file-type list online.
    try:
        # closing(): the Python 2 response object is not a context manager.
        with closing(urlopen(Request(format_european_url))) as urlresponse:
            payload = urlresponse.read()
        online_graph = rdflib.Graph()
        # The payload is stored as the ``.rdf`` backup and re-read from there,
        # so it is expected to be RDF/XML -- TODO confirm should the configured
        # URL ever serve another serialization.
        online_graph.parse(data=payload, format='xml')
        subject_count = len(set(online_graph.subjects()))
        log.debug("filetype-count:%s", subject_count)
        if subject_count > min_subject_count:
            graph = online_graph
        else:
            log.warning(
                "Only %s file types found at %s, falling back to backup",
                subject_count, format_european_url)
    except Exception:
        log.warning(
            "Could not load file formats from %s, falling back to backup",
            format_european_url, exc_info=True)

    if graph is not None:
        # Save the validated download as backup. The payload is bytes, so the
        # file is opened in binary mode. A failed write must not discard the
        # good online result, hence it is only logged.
        try:
            with open(backup_path, 'wb') as backup_file:
                backup_file.write(payload)
        except (IOError, OSError):
            log.warning(
                "Could not write file format backup to %s",
                backup_path, exc_info=True)
    else:
        # Parse into a fresh graph so partial online data cannot leak in.
        try:
            backup_graph = rdflib.Graph()
            backup_graph.parse(backup_path)
            if len(set(backup_graph.subjects())) > min_subject_count:
                graph = backup_graph
        except Exception:
            log.warning(
                "Could not load file format backup from %s",
                backup_path, exc_info=True)
        if graph is None:
            raise Exception("Could not get file formats from " + format_european_url)

    import_table = {}
    export_table = {}
    # rdflib terms are converted to plain text because they do not compare
    # equal to ordinary strings used as lookup keys by the callers.
    # Sorted so that the result is deterministic if two URIs ever share the
    # same last segment (the later one wins in the export table).
    for uri in sorted(set(text_type(subject) for subject in graph.subjects())):
        short_name = uri.split('/')[-1]
        if short_name != 'file-type':
            export_table[short_name] = uri
            import_table[uri] = short_name

    _RESOURCE_FORMATS_IMPORT = import_table
    _RESOURCE_FORMATS_EXPORT = export_table
def resource_formats_export():
    """Return the export table (short name -> full file-type URI).

    If the table is still unset or empty, ``resource_formats()`` is called
    first to fill it.
    """
    global _RESOURCE_FORMATS_EXPORT
    if _RESOURCE_FORMATS_EXPORT:
        return _RESOURCE_FORMATS_EXPORT
    # Nothing loaded yet: build the tables, then hand out the fresh one.
    resource_formats()
    return _RESOURCE_FORMATS_EXPORT
def resource_formats_import():
    """Return the import table (full file-type URI -> short name).

    If the table is still unset or empty, ``resource_formats()`` is called
    first to fill it.
    """
    global _RESOURCE_FORMATS_IMPORT
    if _RESOURCE_FORMATS_IMPORT:
        return _RESOURCE_FORMATS_IMPORT
    # Nothing loaded yet: build the tables, then hand out the fresh one.
    resource_formats()
    return _RESOURCE_FORMATS_IMPORT
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment