diff --git a/ckanext/odsh/profiles/odsh_european_dcatap_profile.py b/ckanext/odsh/profiles/odsh_european_dcatap_profile.py
new file mode 100644
index 0000000000000000000000000000000000000000..544323bd1d9b32db11f84e78b93f4a5fb364f92d
--- /dev/null
+++ b/ckanext/odsh/profiles/odsh_european_dcatap_profile.py
@@ -0,0 +1,139 @@
+import logging
+import rdflib
+
+from ckan.common import config, json
+from ckan.model.license import LicenseRegister
+from ckanext.dcat.profiles import EuropeanDCATAPProfile, DCT, URIRefOrLiteral
+from ckanext.dcatde.profiles import DCAT
+
+log = logging.getLogger(__name__)
+DCT = rdflib.namespace.Namespace("http://purl.org/dc/terms/")
+DCAT = rdflib.namespace.Namespace("http://www.w3.org/ns/dcat#")
+
+
+class ODSHEuropeanDCATAPProfile(EuropeanDCATAPProfile):
+
+    def _license(self, dataset_ref):
+        if self._licenceregister_cache is not None:
+            license_uri2id, license_title2id = self._licenceregister_cache
+        else:
+            license_uri2id = {}
+            license_title2id = {}
+            for license_id, license in LicenseRegister().items():
+                license_uri2id[license_id] = license_id
+                license_uri2id[license.url] = license_id
+                license_title2id[license.title] = license_id
+            self._licenceregister_cache = license_uri2id, license_title2id
+
+        for distribution in self._distributions(dataset_ref):
+            # If the distribution has a license, attach it to the dataset
+            license = self._object(distribution, DCT.license)
+            if license:
+                # Try to find a matching license comparing URIs, then titles
+                license_id = license_uri2id.get(license.toPython())
+                if not license_id:
+                    license_id = license_title2id.get(
+                        self._object_value(license, DCT.title))
+                if license_id:
+                    return license_id
+        return ''
+
+    def _distribution_format(self, distribution, normalize_ckan_format=True):
+        imt, label = super(ODSHEuropeanDCATAPProfile, self)._distribution_format(
+            distribution, normalize_ckan_format)
+        if label in resource_formats_import():
+            label = resource_formats_import()[label]
+        return imt, label
+
+    def graph_from_dataset(self, dataset_dict, dataset_ref):
+        super(ODSHEuropeanDCATAPProfile, self).graph_from_dataset(
+            dataset_dict, dataset_ref)
+        for s, p, o in self.g.triples((None, rdflib.RDF.type, DCAT.Distribution)):
+            for s2, p2, o2 in self.g.triples((s, DCT['format'], None)):
+                if o2.decode() in resource_formats_export():
+                    self.g.set((s, DCT['format'], rdflib.URIRef(
+                        resource_formats_export()[o2.decode()])))
+        for s, p, o in self.g.triples((None, DCT.language, None)):
+            if o.decode() in get_language():
+                self.g.set((s, p, rdflib.URIRef(get_language()[o.decode()])))
+            elif type(o) == rdflib.Literal and type(URIRefOrLiteral(o.decode())) == rdflib.URIRef:
+                self.g.set((s, p, rdflib.URIRef(o.decode())))
+
+        license = dataset_dict.get('license_id', None)
+        if license:
+            self.g.add((dataset_ref, DCT.license, rdflib.URIRef(license)))
+            for dist in self.g.objects(dataset_ref, DCAT.distribution):
+                self.g.add((dist, DCT.license, rdflib.URIRef(license)))
+
+
+_RESOURCE_FORMATS_IMPORT = None
+_RESOURCE_FORMATS_EXPORT = None
+
+def resource_formats():
+    global _RESOURCE_FORMATS_IMPORT
+    global _RESOURCE_FORMATS_EXPORT
+    _RESOURCE_FORMATS_IMPORT = {}
+    _RESOURCE_FORMATS_EXPORT = {}
+    g = rdflib.Graph()
+    # Load the file-format vocabulary from the configured local fallback file
+    try:
+        fallback_filepath = config.get(
+            'ckan.odsh.resource_formats_fallback_filepath')
+        g.parse(fallback_filepath)
+        assert len(set([s for s in g.subjects()])) > 120
+    except Exception:
+        log.exception("failed to process resource_formats")
+        raise Exception('failed to load formats')
+    file_types = [subj.decode() for subj in g.subjects()]
+
+    for elem in sorted(set(file_types)):
+        if elem.split('/')[-1] != 'file-type':
+            _RESOURCE_FORMATS_EXPORT[elem.split('/')[-1]] = elem
+            _RESOURCE_FORMATS_IMPORT[elem] = elem.split('/')[-1]
+
+def resource_formats_export():
+    global _RESOURCE_FORMATS_EXPORT
+    if not _RESOURCE_FORMATS_EXPORT:
+        resource_formats()
+    return _RESOURCE_FORMATS_EXPORT
+
+def resource_formats_import():
+    global _RESOURCE_FORMATS_IMPORT
+    if not _RESOURCE_FORMATS_IMPORT:
+        resource_formats()
+    return _RESOURCE_FORMATS_IMPORT
+
+
+_LANGUAGES = None
+
+def get_language():
+    ''' When datasets are exported in RDF format, their language tag
+    should be given as
+    "<dct:language rdf:resource="http://publications.europa.eu/.../XXX"/>",
+    where XXX is the language code according to the ISO 639-3 standard.
+    However, some imported datasets represent their language as
+    "<dct:language>de</dct:language>", which is interpreted here as an
+    ISO 639-1 value. As the language setting is not displayed in the
+    web frontend, this function only ensures the correct export format,
+    using 'languages.json' as the mapping table.
+    '''
+    global _LANGUAGES
+    if not _LANGUAGES:
+        _LANGUAGES = {}
+        languages_file_path = config.get('ckanext.odsh.language.mapping')
+        if not languages_file_path:
+            log.warning(
+                "Could not find config setting: 'ckanext.odsh.language.mapping', using fallback instead.")
+            languages_file_path = '/usr/lib/ckan/default/src/ckanext-odsh/languages.json'
+        with open(languages_file_path) as languages_file:
+            try:
+                language_mapping_table = json.loads(languages_file.read())
+            except ValueError as e:
+                # includes simplejson.decoder.JSONDecodeError
+                raise ValueError('Invalid JSON syntax in %s: %s' %
+                                 (languages_file_path, e))
+
+        for language_line in language_mapping_table:
+            _LANGUAGES[language_line[0]] = language_line[1]
+
+    return _LANGUAGES
\ No newline at end of file
diff --git a/out.uls b/out.uls
new file mode 100644
index 0000000000000000000000000000000000000000..de4db1551cc3abf0e549b6545439211fd1fe70df
--- /dev/null
+++ b/out.uls
@@ -0,0 +1,30 @@
+V;2019-04-25 12:18:36;ckan274;"ULS";"Exception";"traceback";"Text";"
+Traceback (most recent call last):
+File \\"/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/plugin.py\\", line 358, in before_search
+raise BaseException('boom')
+BaseException: boom
+";
+V;2019-04-25 12:18:37;ckan274;"ULS";"Exception";"traceback";"Text";"
+Traceback (most recent call last):
+File \\"/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/plugin.py\\", line 358, in before_search
+raise BaseException('boom')
+BaseException: boom
+";
+V;2019-04-25 12:18:38;ckan274;"ULS";"Exception";"traceback";"Text";"
+Traceback (most recent call last):
+File \\"/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/plugin.py\\", line 358, in before_search
+raise BaseException('boom')
+BaseException: boom
+";
+V;2019-04-25 12:18:40;ckan274;"ULS";"Exception";"traceback";"Text";"
+Traceback (most recent call last):
+File \\"/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/plugin.py\\", line 358, in before_search
+raise BaseException('boom')
+BaseException: boom
+";
+V;2019-04-25 12:18:41;ckan274;"ULS";"Exception";"traceback";"Text";"
+Traceback (most recent call last):
+File \\"/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/plugin.py\\", line 358, in before_search
+raise BaseException('boom')
+BaseException: boom
+";
diff --git a/validation.py b/validation.py
new file mode 100644
index 0000000000000000000000000000000000000000..e50cbb4394ddbcc447a7fe87d200e738ba6019e6
--- /dev/null
+++ b/validation.py
@@ -0,0 +1,241 @@
+# This Python file uses the following encoding: utf-8
+import logging
+import csv
+import re
+import urllib2
+import json
+from itertools import count
+from dateutil.parser import parse
+
+import ckan.plugins.toolkit as toolkit
+import ckan.model as model
+from ckan.lib.navl.dictization_functions import Missing
+
+from pylons import config
+
+import pdb
+
+_ = toolkit._
+
+log = logging.getLogger(__name__)
+
+
+def _extract_value(data, field):
+    key = None
+    for k in data.keys():
+        if data[k] == field:
+            key = k
+            break
+    if key is None:
+        return None
+    return data[(key[0], key[1], 'value')]
+
+
+def validate_extra_groups(data, requireAtLeastOne, errors):
+    value = _extract_value(data, 'groups')
+    if value is not None:
+        # The extra key 'groups' was found, so the dataset
+        # came from manual editing via the web frontend.
+        if not value:
+            if requireAtLeastOne:
+                errors['groups'] = 'at least one group needed'
+            data[('groups', 0, 'id')] = ''
+            return
+
+        groups = [g.strip() for g in value.split(',') if g.strip()]
+        for k in data.keys():
+            if len(k) == 3 and k[0] == 'groups':
+                data[k] = ''
+                # del data[k]
+        if len(groups) == 0:
+            if requireAtLeastOne:
+                errors['groups'] = 'at least one group needed'
+            return
+
+        for num, group in zip(range(len(groups)), groups):
+            data[('groups', num, 'id')] = group
+    else:  # no extra field 'groups'
+        # dataset might come from a harvest process
+        if not data.get(('groups', 0, 'id'), False) and \
+                not data.get(('groups', 0, 'name'), False):
+            errors['groups'] = 'at least one group needed'
+
+
+def validate_extras(key, data, errors, context):
+    extra_errors = {}
+    isStaNord = ('id',) in data and data[('id',)][:7] == 'StaNord'
+
+    validate_extra_groups(data, True, extra_errors)
+    validate_extra_date_new(key, 'issued', data, isStaNord, extra_errors)
+    validate_extra_date_new(key, 'temporal_start',
+                            data, isStaNord, extra_errors)
+    validate_extra_date_new(key, 'temporal_end', data, True, extra_errors)
+
+    if len(extra_errors.values()):
+        raise toolkit.Invalid(extra_errors)
+
+
+def _set_value(data, field, value):
+    key = None
+    for k in data.keys():
+        if data[k] == field:
+            key = k
+            break
+    if key is None:
+        return None
+    data[(key[0], key[1], 'value')] = value
+
+
+def validate_extra_date_new(key, field, data, optional, errors):
+    value = _extract_value(data, field)
+
+    if not value:
+        if not optional:
+            errors[field] = 'empty'
+        return
+    else:
+        if re.match(r'\d\d\d\d-\d\d-\d\d', value):
+            try:
+                dt = parse(value)
+                _set_value(data, field, dt.isoformat())
+                return
+            except ValueError:
+                pass
+        errors[field] = 'not a valid date'
+
+
+def validate_licenseAttributionByText(key, data, errors, context):
+    register = model.Package.get_license_register()
+    isByLicense = False
+    for k in data:
+        if len(k) > 0 and k[0] == 'license_id' and data[k] and not isinstance(data[k], Missing) and \
+                'Namensnennung' in register[data[k]].title:
+            isByLicense = True
+            break
+    hasAttribution = False
+    for k in data:
+        if data[k] == 'licenseAttributionByText':
+            # Check membership first; the 'value' entry may be absent or Missing.
+            if (k[0], k[1], 'value') not in data or isinstance(data[(k[0], k[1], 'value')], Missing):
+                data.pop((k[0], k[1], 'value'), None)
+                del data[(k[0], k[1], 'key')]
+                break
+            else:
+                value = data[(k[0], k[1], 'value')]
+                hasAttribution = value != ''
+                break
+    if not hasAttribution:
+        current_indexes = [k[1] for k in data.keys()
+                           if len(k) > 1 and k[0] == 'extras']
+
+        new_index = max(current_indexes) + 1 if current_indexes else 0
+        data[('extras', new_index, 'key')] = 'licenseAttributionByText'
+        data[('extras', new_index, 'value')] = ''
+
+    if isByLicense and not hasAttribution:
+        raise toolkit.Invalid(
+            'licenseAttributionByText: empty not allowed')
+
+    if not isByLicense and hasAttribution:
+        raise toolkit.Invalid(
+            'licenseAttributionByText: text not allowed for this license')
+
+
+def known_spatial_uri(key, data, errors, context):
+    value = _extract_value(data, 'spatial_uri')
+
+    if not value:
+        poly = None
+
+        # some harvesters might import a polygon directly...
+        # pdb.set_trace()
+        poly = _extract_value(data, 'spatial')
+
+        has_old_uri = False
+        pkg = context.get('package', None)
+        if pkg:
+            old_uri = pkg.extras.get('spatial_uri', None)
+            has_old_uri = old_uri is not None and len(old_uri) > 0
+            if not poly:
+                poly = pkg.extras.get('spatial', None)
+        if not poly or has_old_uri:
+            raise toolkit.Invalid('spatial_uri: empty not allowed')
+        else:
+            if poly:
+                new_index = next_extra_index(data)
+                data[('extras', new_index, 'key')] = 'spatial'
+                data[('extras', new_index, 'value')] = poly
+            return
+
+    mapping_file = config.get('ckanext.odsh.spatial.mapping')
+    try:
+        mapping_file = urllib2.urlopen(mapping_file)
+    except Exception:
+        raise Exception("Could not load spatial mapping file!")
+
+    not_found = True
+    spatial_text = str()
+    spatial = str()
+    cr = csv.reader(mapping_file, delimiter="\t")
+    for row in cr:
+        if row[0].encode('UTF-8') == value:
+            not_found = False
+            spatial_text = row[1]
+            loaded = json.loads(row[2])
+            spatial = json.dumps(loaded['geometry'])
+            break
+    if not_found:
+        raise toolkit.Invalid(
+            'spatial_uri: uri unknown')
+
+    new_index = next_extra_index(data)
+
+    data[('extras', new_index, 'key')] = 'spatial_text'
+    data[('extras', new_index, 'value')] = spatial_text
+    data[('extras', new_index+1, 'key')] = 'spatial'
+    data[('extras', new_index+1, 'value')] = spatial
+
+
+def next_extra_index(data):
+    current_indexes = [k[1] for k in data.keys()
+                       if len(k) > 1 and k[0] == 'extras']
+
+    return max(current_indexes) + 1 if current_indexes else 0
+
+
+def tag_name_validator(value, context):
+    tagname_match = re.compile(u'[\w \-.\:\(\)\´\`]*$', re.UNICODE)
+    if not tagname_match.match(value):
+        raise toolkit.Invalid(_('Tag "%s" must be alphanumeric '
+                                'characters or symbols: -_.:()') % (value))
+    return value
+
+
+def tag_string_convert(key, data, errors, context):
+    '''Takes a list of tags that is a comma-separated string (in data[key])
+    and parses tag names. These are added to the data dict, enumerated. They
+    are also validated.'''
+    if isinstance(data[key], basestring):
+        tags = [tag.strip()
+                for tag in data[key].split(',')
+                if tag.strip()]
+    else:
+        tags = data[key]
+
+    current_index = max([int(k[1]) for k in data.keys()
+                         if len(k) == 3 and k[0] == 'tags'] + [-1])
+
+    for num, tag in zip(count(current_index+1), tags):
+        data[('tags', num, 'name')] = tag
+
+    for tag in tags:
+        toolkit.get_validator('tag_length_validator')(tag, context)
+        tag_name_validator(tag, context)
+
+
+def get_validators():
+    return {
+        'known_spatial_uri': known_spatial_uri,
+        'odsh_tag_name_validator': tag_name_validator,
+        'odsh_validate_extras': validate_extras,
+        'validate_licenseAttributionByText': validate_licenseAttributionByText
+    }
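
For context, validation.get_validators() only builds a name-to-callable mapping; the validators become usable in dataset schemas once a plugin registers them through CKAN's IValidators interface. A minimal sketch of that wiring, assuming a hypothetical plugin class name (OdshValidationPlugin) and that this module is importable as ckanext.odsh.validation; neither name is fixed by this diff:

    import ckan.plugins as plugins

    from ckanext.odsh import validation


    class OdshValidationPlugin(plugins.SingletonPlugin):
        # Expose the custom validators (known_spatial_uri, odsh_validate_extras,
        # odsh_tag_name_validator, validate_licenseAttributionByText) to CKAN
        # so dataset schemas can reference them by name.
        plugins.implements(plugins.IValidators)

        def get_validators(self):
            return validation.get_validators()

Similarly, get_language() assumes that 'languages.json' contains a JSON array of two-element entries mapping an ISO 639-1 code to a language URI. An illustrative two-entry file under that assumption (the URIs follow the EU Publications Office language authority pattern and are examples only):

    [
        ["de", "http://publications.europa.eu/resource/authority/language/DEU"],
        ["en", "http://publications.europa.eu/resource/authority/language/ENG"]
    ]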