From ec1fc35e8adee240d11af993a6d1deef733ab552 Mon Sep 17 00:00:00 2001
From: anonymous <anonymous>
Date: Thu, 25 Apr 2019 15:22:46 +0200
Subject: [PATCH] ODPSH-381: convert file formats from short to long and
 vice-versa (DCAT)

---
 ckanext/odsh/profiles.py | 83 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/ckanext/odsh/profiles.py b/ckanext/odsh/profiles.py
index a3b2933a..0c57a384 100644
--- a/ckanext/odsh/profiles.py
+++ b/ckanext/odsh/profiles.py
@@ -2,10 +2,21 @@ from ckanext.dcatde.profiles import DCATdeProfile, DCATDE, DCAT, VCARD, dcat_the
 from ckanext.dcat.utils import resource_uri
 from ckanext.dcat.profiles import EuropeanDCATAPProfile, DCT
 from ckan.model.license import LicenseRegister
+import rdflib
 import ckanext.dcatde.dataset_utils as ds_utils
 import logging
+from ckan.plugins import toolkit
+from ckan.common import config
+
+import sys
+if sys.version_info[0] == 2:
+    import urllib2
+elif sys.version_info[0] == 3:  # >=Python3.1
+    import urllib
 
 log = logging.getLogger(__name__)
+DCT = rdflib.namespace.Namespace("http://purl.org/dc/terms/")  # NOTE(review): rebinds the DCT imported from ckanext.dcat.profiles above
+DCAT = rdflib.namespace.Namespace("http://www.w3.org/ns/dcat#")  # NOTE(review): rebinds the DCAT imported from ckanext.dcatde.profiles - confirm intended
 
 class ODSHEuropeanDCATAPProfile(EuropeanDCATAPProfile):
 
@@ -34,6 +45,19 @@ class ODSHEuropeanDCATAPProfile(EuropeanDCATAPProfile):
                     return license_id
         return ''
 
+    def _distribution_format(self, distribution, normalize_ckan_format=True):
+        """Return the parent's (imt, label), mapping label via resource_formats_import()."""
+        parent = super(ODSHEuropeanDCATAPProfile, self)
+        imt, label = parent._distribution_format(distribution, normalize_ckan_format)
+        return imt, resource_formats_import().get(label, label)
+        
+    def graph_from_dataset(self, dataset_dict, dataset_ref):
+        """Build the graph, then replace known short dct:format values of distributions by URIs."""
+        super(ODSHEuropeanDCATAPProfile, self).graph_from_dataset(dataset_dict, dataset_ref)
+        for s, o in list(self.g.subject_objects(DCT['format'])):  # snapshot: set() mutates the graph
+            if (s, rdflib.RDF.type, DCAT.Distribution) in self.g and type(u'')(o) in resource_formats_export():
+                self.g.set((s, DCT['format'], rdflib.URIRef(resource_formats_export()[type(u'')(o)])))
+
 class ODSHDCATdeProfile(DCATdeProfile):
     def parse_dataset(self, dataset_dict, dataset_ref):
         dataset_dict = super(ODSHDCATdeProfile,self).parse_dataset(dataset_dict, dataset_ref)
@@ -48,3 +72,62 @@ class ODSHDCATdeProfile(DCATdeProfile):
                             ds_utils.insert_new_extras_field(dataset_dict, 'licenseAttributionByText', value)
                             return dataset_dict
         return dataset_dict
+        
+_RESOURCE_FORMATS_IMPORT = None  # subject URI -> last path segment; filled by resource_formats()
+_RESOURCE_FORMATS_EXPORT = None  # last path segment -> subject URI; filled by resource_formats()
+
+def resource_formats():
+    """(Re)load the EU file-type vocabulary into the import/export maps.
+
+    Tries the online list first (saving a backup copy on success), then the
+    local backup file. Raises Exception if neither yields a usable list.
+    """
+    global _RESOURCE_FORMATS_IMPORT, _RESOURCE_FORMATS_EXPORT
+    _RESOURCE_FORMATS_IMPORT, _RESOURCE_FORMATS_EXPORT = {}, {}
+    backup = '/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/fileformats.rdf'
+    url = (config.get('ckan.odsh.resource_formats_url')
+           or "http://publications.europa.eu/resource/authority/file-type")
+    g = rdflib.Graph()
+    try:
+        if sys.version_info[0] == 2:
+            content = urllib2.urlopen(urllib2.Request(url)).read()
+        else:
+            import urllib.request  # "import urllib" alone does not load it
+            content = urllib.request.urlopen(urllib.request.Request(url)).read()
+        g.parse(data=content)
+        log.debug("filetype-count:" + str(len(set(g.subjects()))))
+        # About 143 file types are listed; 120 or fewer means a bad download.
+        if len(set(g.subjects())) <= 120:
+            raise ValueError("too few file types from " + url)
+        try:
+            # Save the bytes already downloaded; 'wb' works on Python 2 and 3.
+            with open(backup, 'wb') as f:
+                f.write(content)
+        except (IOError, OSError):
+            log.warning("Could not write file formats backup " + backup)
+    except Exception:
+        log.warning("Could not load file formats online, trying backup", exc_info=True)
+        g = rdflib.Graph()  # discard triples left over from the failed download
+        try:
+            g.parse(backup)
+            if len(set(g.subjects())) <= 120:
+                raise ValueError("too few file types in " + backup)
+        except Exception:
+            raise Exception("Could not get file formats from " + url)
+    for uri in sorted({type(u'')(subj) for subj in g.subjects()}):  # text on py2/py3
+        short = uri.split('/')[-1]
+        if short != 'file-type':
+            _RESOURCE_FORMATS_EXPORT[short] = uri
+            _RESOURCE_FORMATS_IMPORT[uri] = short
+
+def resource_formats_export():
+    """Return the last-path-segment -> subject-URI map, loading it first when empty."""
+    if not _RESOURCE_FORMATS_EXPORT:
+        resource_formats()
+    return _RESOURCE_FORMATS_EXPORT
+    
+def resource_formats_import():
+    """Return the subject-URI -> last-path-segment map, loading it first when empty."""
+    if not _RESOURCE_FORMATS_IMPORT:
+        resource_formats()
+    return _RESOURCE_FORMATS_IMPORT
-- 
GitLab