diff --git a/ckanext/odsh/harvesters/ckan_mapper.py b/ckanext/odsh/harvesters/ckan_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..da5f618f88d6229e8349bf2adee9068504771b8e --- /dev/null +++ b/ckanext/odsh/harvesters/ckan_mapper.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +""" +Mapper that loads all necessary mapping files, calls the pyjq method, and returns valid CKAN-formatted values +""" +import json +import pyjq + +# Import the "stat_amt_nord" configuration file to determine +# 1st the mapping from Statistikamt Nord fields onto CKAN fields and +# 2nd the mapping from the category numbers onto the MDR data-theme authority codes +import config_stat_amt_nord as sta_amt_nord + + +def pyjq_mapper(config_filter, value, numbers): + """ + :param config_filter: delivery system specific configuration string + :param value: input to map onto the CKAN format + :param numbers: delivery system specific mapping from numbers to MDR authority codes + :return: valid CKAN-formatted value + """ + + if config_filter == "": + raise ValueError('Config string can not be empty.') + else: + tmp = pyjq.all(config_filter, value, vars={"numbers": numbers}) + + # print "tmp cm: " + str(tmp) + return dict(tmp[0]) + + +# For quick manual testing of the pyjq lib only +test_config_filter = sta_amt_nord.config_filter +test_numbers = sta_amt_nord.numbers + +#with open('/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/harvesters/statistik-nord-example_2.json') as f: +# input = json.load(f) +# print "Input: " + str(input) +# result = dict(pyjq_mapper(test_config_filter, input, test_numbers)) +#print "Result: " + str(result) + + + diff --git a/ckanext/odsh/harvesters/config_stat_amt_nord.py b/ckanext/odsh/harvesters/config_stat_amt_nord.py new file mode 100644 index 0000000000000000000000000000000000000000..4afcca1efc4d04378484d8b3ce4dfb72586a10d6 --- /dev/null +++ b/ckanext/odsh/harvesters/config_stat_amt_nord.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +""" +Some variables needed for configuration of the ckan_mapper function. +It supplies a filter string needed by the pyjq mapper and a dictionary containing the Statistikamt Nord +category numbers and the corresponding MDR data-theme authority codes. + +The config_filter string is built up like a JSON string. +To generate this string, visit https://stedolan.github.io/jq/manual/. +The keys are valid CKAN fields, and the values are the paths to +the corresponding values within the Statistikamt Nord datasets. +""" + +config_filter = '{"author" : .VeroeffentlichendeStelle.Name,' \ '"url" : .WeitereInformationen.URL,' \ '"notes" : .Beschreibung,' \ '"title" : .Titel,' \ '"author_email": .VeroeffentlichendeStelle.EMailAdresse,' \ '"extras": [' \ ' {"key": "identifier", "value": (.Quelle + ":" +.DokumentID)},' \ ' {"key": "issued", "value": .Veroeffentlichungsdatum?},' \ ' {"key": "modified", "value": .Aenderungsdatum?},' \ ' {"key": "temporal_start", "value": .ZeitraumVon?},' \ ' {"key": "temporal_end", "value": .ZeitraumBis?},' \ ' ([$numbers[.StANKategorie][]?] | {"key":"theme", "value": .})],' \ '"resources" : [.Ressourcen.Ressource[] | {' \ ' "url":.URLZumDownload,' \ ' "size" : (.Dateigroesse | tonumber | . 
* 1000000 | floor),' \ ' "name" : .Ressourcenname,' \ ' "format": .Format.FormatTyp,' \ ' "license": "dl-by-de/2.0"}],' \ '"license_id" : "dl-by-de/2.0",' \ '"type" : "dataset",' \ '"tags" : [.Schlagwoerter |.Schlagwort[] | gsub(", "; ",") | select(length > 0) |' \ ' split(",") |.[] |.+ "" | if (. | length) <= 100 then {"name" : .} else empty end],' \ '"groups" : [$numbers[.StANKategorie][]? | {"name" : .}]}' + +# This mapping should be replaced with the official mapping list from Statistikamt Nord +numbers = { +"59" : ["econ", "agri"], +"56" : ["econ"], +"74" : ["econ"], +"48" : ["regi"], +"50" : ["regi"], +"68" : ["econ"], +"54" : ["econ"], +"116" : ["soci"], +"7" : ["soci"], +"9" : ["soci"], +"8" : ["soci"], +"93" : ["gove"], +"36" : ["educ"], +"75" : ["econ"], +"58" : ["tech"], +"90" : ["gove"], +"91" : ["gove"], +"73" : ["econ"], +"55" : ["econ"], +"31" : ["soci"], +"87" : ["ener", "envi"], +"60" : ["econ"], +"53" : ["econ"], +"52" : ["econ"], +"92" : ["gove"], +"10" : ["soci"], +"79" : ["gove", "econ"], +"72" : ["intr"], +"117" : ["regi"], +"49" : ["regi"], +"35" : ["regi"], +"43" : ["heal"], +"42" : ["heal", "soci"], +"64" : ["econ"], +"70" : ["intr"], +"69" : ["econ"], +"27" : ["soci"], +"28" : ["soci", "econ"], +"38" : ["educ"], +"108" : ["econ"], +"67" : ["econ"], +"66" : ["econ"], +"65" : ["econ"], +"46" : ["soci"], +"39" : ["intr"], +"57" : ["agri"], +"115" : ["agri"], +"80" : ["gove"], +"82" : ["gove"], +"47" : ["soci"], +"78" : ["soci"], +"40" : ["just"], +"85" : ["tran", "econ"], +"37" : ["educ"], +"45" : ["soci"], +"81" : ["gove"], +"84" : ["tran", "econ"], +"44" : ["heal"], +"71" : ["educ"], +"88" : ["envi"], +"86" : ["envi"], +"62" : ["econ"], +"63" : ["econ"], +"83" : ["tran", "envi", "ener"], +"77" : ["econ"], +"61" : ["econ"], +"98" : ["gove"], +"76" : ["econ"], +"89" : ["gove"], +"41" : ["educ"], +"51" : ["regi"], +"111" : ["soci", "regi"] +} diff --git a/ckanext/odsh/harvesters/statistik-nord-example_2.json b/ckanext/odsh/harvesters/statistik-nord-example_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7abacf71235940042df82cb98db6e695ec22bb93 --- /dev/null +++ b/ckanext/odsh/harvesters/statistik-nord-example_2.json @@ -0,0 +1,38 @@ +{"TypDesInhalts": "dokument", + "Quelle": "StaNord_CMS", + "AbdeckungAlsTextAusOGDD": "", + "AlternativtextFuerDasBild": "", + "StANKategorie": "83", + "Beschreibung": "", + "Beziehungsverknuepfungen": "", + "OfflineDatum": "", + "ArtDerFormAusOGDD": "", + "Nummern": {"Nummer": ["Q I 3 - j/02 H"]}, + "DokumentID": "50625", + "Ansprechpartner": "", + "StANProdukte": ["106"], + "E-Aktenzeichen": "", + "ZeitlicheGranularitaetEinheit": "", + "VeroeffentlichendeStelle": {"Name": "Statistisches Amt für Hamburg und Schleswig-Holstein", + "EMailAdresse": "info@statistik-nord.de"}, + "Bild": "", + "WeitereInformationen": {"URL": "http://www.statistik-nord.de"}, + "AbdeckungInKoordinatenAusOGDD": "", + "Schlagwoerter": {"Schlagwort": ["Umwelt ", "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg"]}, + "ZeitraumBis": "15.12.2002", + "Volltext": "", + "Namensnennung": "Freie und Hansestadt Hamburg, Statistisches Amt für Hamburg und Schleswig-Holstein", + "Titel": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002", + "Infogegenstand": {"Bezeichnung": ["statistiken_IG7"]}, + "StANThemen": ["83"], + "Papieraktenzeichen": "", + "ZeitraumVon": "01.01.2002", + 
"MetadatenModellVersionsnummer": "", + "Ressourcen": {"Ressource": + [{"Dateigroesse": "0.046903610229492", + "DateinameInDerLieferung": "Q_I_3_j02_H.pdf", + "URLZumDownload": "https://www.statistik-nord.de/fileadmin/Dokumente/Statistische_Berichte/verkehr_umwelt_und_energie/Q_I_3_j_H/Q_I_3_j02_H.pdf", + "Ressourcenname": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002", + "Format": {"FormatTyp": "pdf"}}]}, + "Nutzungsbestimmungen": {"ID_derLizenz": ["dl-de-by-2.0"]}, + "ZeitlicheGranularitaetIntervallDerErhebung": ""} \ No newline at end of file diff --git a/ckanext/odsh/harvesters/statistiknordharvester.py b/ckanext/odsh/harvesters/statistiknordharvester.py index 9978be0ec48443cc549f100e9f900d1bbc0c3490..4250c305c07e79dde70c2a4d01c9529d4e6e511b 100755 --- a/ckanext/odsh/harvesters/statistiknordharvester.py +++ b/ckanext/odsh/harvesters/statistiknordharvester.py @@ -1,31 +1,35 @@ -import urllib import urllib2 -import httplib -import datetime -import socket import traceback + from lxml import etree import uuid +import logging -from ckan import model, logic -from ckan.logic import ValidationError, NotFound, get_action +from ckan import model +from ckan.logic import get_action from ckan.lib.helpers import json -from ckan.lib.munge import munge_name -from ckan.lib.navl.validators import not_empty + from ckan.plugins import toolkit -from ckanext.harvest.model import HarvestObject, HarvestJob +from ckanext.harvest.model import HarvestObject from ckanext.harvest.harvesters.base import HarvesterBase from ckanext.odsh.model.statistiknord import * -import logging + +# Config filter defines mapping for statistikamt nord - to ckan fields as a string +from config_stat_amt_nord import config_filter +# Number mapper for mapping form statistikamt nord number to MDR data-theme authority codes +from config_stat_amt_nord import numbers + +from ckanext.odsh.harvesters.ckan_mapper import pyjq_mapper + log = logging.getLogger(__name__) class StatistikNordHarvester(HarvesterBase): - ''' + """ A Harvester for Statistikamt Nord - ''' + """ def info(self): return { @@ -43,7 +47,7 @@ class StatistikNordHarvester(HarvesterBase): documents_list = self.get_documents_from_content(fetched_documents) documents = documents_list['RegistereintragsListe'] except Exception, e: - log.error('traceback while reading model: %s' % traceback.format_exc()) + # log.error('traceback while reading model: %s' % traceback.format_exc()) self._save_gather_error('Statistik-Nord-Harvester: Error while reading model [%r]' % e, harvest_job) return False @@ -54,7 +58,7 @@ class StatistikNordHarvester(HarvesterBase): try: fetched_values = self.get_values_from_content(document) identifier = self._create_inforeg_id(fetched_values) - log.info('identifier: %s' % identifier) + # log.info('identifier: %s' % identifier) if identifier in used_identifiers: continue @@ -90,8 +94,8 @@ class StatistikNordHarvester(HarvesterBase): if len(ids) > 0: log.info( "finished %s IDs of %s IDs successfully gathered" % (len(used_identifiers), len(documents))) - log.debug("List of gathered IDs: %s" % ids) - log.debug("gather_stage() finished: %s IDs gathered" % len(ids)) + # log.debug("List of gathered IDs: %s" % ids) + # log.debug("gather_stage() finished: %s IDs gathered" % len(ids)) return ids else: log.error("No records received") @@ -108,7 +112,8 @@ class StatistikNordHarvester(HarvesterBase): 'session': model.Session, 'user': self._get_user_name(), } - log.debug("user: " + self._get_user_name()) 
+ + # log.debug("user: " + self._get_user_name()) if not harvest_object: log.error('Statistik-Nord-Harvester: No harvest object received') return False @@ -117,68 +122,32 @@ class StatistikNordHarvester(HarvesterBase): self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import') return False else: - values = json.loads(harvest_object.content) - package_dict = dict() - package_dict.update({'resources': [], 'tags': []}) - package_dict.update({'title': values['Titel']}) - package_dict.update({'notes': values['Beschreibung']}) - package_dict.update({'license_id': values['Nutzungsbestimmungen']['ID_derLizenz'][0]}) - package_dict.update({'author': values["VeroeffentlichendeStelle"]["Name"]}) - package_dict.update({'author_email': values["VeroeffentlichendeStelle"]["EMailAdresse"]}) - - extras = list() - extras.append({'key': 'identifier', 'value': self._create_inforeg_id(values)}) - package_dict['extras'] = extras - - if values['Ansprechpartner']: - package_dict.update({'maintainer': values['Ansprechpartner']['Name'], - 'maintainer_email': values['Ansprechpartner']['EMailAdresse']}) - try: - package_dict['url'] = values['WeitereInformationen']['URL'] - except KeyError: - package_dict['url'] = "" - - package_dict.update({'type': 'dataset'}) - - resources = values['Ressourcen']['Ressource'] - - for resource in resources: - resource_dict = dict() - resource_dict['name'] = resource['Ressourcenname'] - resource_dict['format'] = resource['Format'].get('FormatTyp', "") - resource_dict['url'] = resource['URLZumDownload'] - if resource['Dateigroesse'] == "0" or len(resource['Dateigroesse']) == 0: - resource_file = urllib2.urlopen(resource['url']) - resource_dict['file_size'] = resource_file['Content-Length'] - else: - file_size = int(round(float(resource['Dateigroesse']) * 1000000)) - resource_dict['file_size'] = file_size - package_dict['resources'].append(resource_dict) - - tags = values['Schlagwoerter']['Schlagwort'] - for tag in tags: - seperated_tags = tag.split(',') - for seperated_tag in seperated_tags: - if seperated_tag != '' and len(seperated_tag) < 100: - package_dict['tags'].append({'name': seperated_tag.strip()}) - - source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id}) - package_dict['owner_org'] = source_dataset.get('owner_org') - - package_dict['id'] = str(uuid.uuid4()) - - log.debug(json.dumps(package_dict)) - try: - result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show') - return result - except toolkit.ValidationError, e: - self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import') - return False + self.dcat_mapper(context, harvest_object) + return True + + # A mapper method that maps the content of the harvested object onto the CKAN dataset fields + def dcat_mapper(self, context, harvest_object): + values = json.loads(harvest_object.content) + + # Use the pyjq lib for the default field mapping + package = pyjq_mapper(config_filter, values, numbers) + + # Add some meta data that is not part of the harvested_object + source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id}) + package['owner_org'] = source_dataset.get('owner_org') + package['id'] = str(uuid.uuid4()) + package_dict = dict(package) + try: + result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show') + return result + except toolkit.ValidationError, e: + 
self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import') + return False @staticmethod def _get_content(url): url = url.replace(' ', '%20') - log.debug("get_content StatistikNord harvester: %s" % url) + # log.debug("get_content StatistikNord harvester: %s" % url) try: http_response = urllib2.urlopen(url, timeout=100000) content = http_response.read() @@ -192,9 +161,7 @@ class StatistikNordHarvester(HarvesterBase): def get_documents_from_content(content): fetched_xml = etree.fromstring(content) fetched_string = etree.tostring(fetched_xml) - fetched_document = StatistikNordDocuments(fetched_string) - fetched_values = fetched_document.read_values() return fetched_values @@ -216,23 +183,6 @@ class StatistikNordHarvester(HarvesterBase): else: return quelle + ':' + guid.strip() - def add_groups_to_fetched_values(self, fetched_values): - groups = [] - if 'StANProdukte' in fetched_values and '4' in fetched_values['StANProdukte']: - log.debug("Get Groups from database") - groups = self.get_all_groups() - #else: - # if 'StANThemen' in fetched_values: - # groups = self.translate_group(fetched_values['StANThemen']) - - fetched_values['Kategorie'] = {} - fetched_values['Kategorie']['NameDerKategorie'] = [] - if groups: - fetched_values['Kategorie']['NameDerKategorie'] = groups - - log.debug(fetched_values['Kategorie']['NameDerKategorie']) - return fetched_values - @staticmethod def get_all_groups(): result_groups = [] diff --git a/ckanext/odsh/harvesters/statistiknordharvester_old.py b/ckanext/odsh/harvesters/statistiknordharvester_old.py new file mode 100755 index 0000000000000000000000000000000000000000..87093f91f12790185f36fb6e30f7f0e053fae5f2 --- /dev/null +++ b/ckanext/odsh/harvesters/statistiknordharvester_old.py @@ -0,0 +1,303 @@ +import urllib2 +import traceback + +from ckanext.odsh.harvesters.ckan_mapper import pyjq_mapper +from lxml import etree +import uuid + +from ckan import model +from ckan.logic import get_action +from ckan.lib.helpers import json + +from ckan.plugins import toolkit + +from ckanext.harvest.model import HarvestObject +from ckanext.harvest.harvesters.base import HarvesterBase + +from ckanext.odsh.model.statistiknord import * +import logging + +log = logging.getLogger(__name__) + + +class StatistikNordHarvester(HarvesterBase): + ''' + A Harvester for Statistikamt Nord + ''' + + def info(self): + return { + 'name': 'statistik-nord', + 'title': 'Statistik Nord', + 'description': 'Harvests Statistikamt Nord', + 'form_config_interface': 'Text' + } + + def gather_stage(self, harvest_job): + url = harvest_job.source.url + + try: + fetched_documents = self._get_content(url) + documents_list = self.get_documents_from_content(fetched_documents) + documents = documents_list['RegistereintragsListe'] + except Exception, e: + #log.error('traceback while reading model: %s' % traceback.format_exc()) + self._save_gather_error('Statistik-Nord-Harvester: Error while reading model [%r]' % e, harvest_job) + return False + + try: + used_identifiers = [] + ids = [] + for document in documents: + try: + fetched_values = self.get_values_from_content(document) + identifier = self._create_inforeg_id(fetched_values) + #log.info('identifier: %s' % identifier) + + if identifier in used_identifiers: + continue + + if identifier is None: + log.error("ID: unknown - gather process failed ") + continue + + if identifier: + obj = HarvestObject(guid=identifier, + job=harvest_job) + obj.content = json.dumps(fetched_values) + obj.save() + log.info( + "harvest_object_id: 
%s, GUID: %s successfully gathered " % (str(obj.id), str(obj.guid))) + used_identifiers.append(identifier) + ids.append(obj.id) + log.debug('Save identifier %s from Statistik Nord' % identifier) + + except Exception, e: + log.error('traceback: %s' % traceback.format_exc()) + self._save_gather_error( + 'Statistik-Nord-Harvester: Error for the identifier %s [%r]' % (identifier, e), harvest_job) + continue + + except Exception, e: + self._save_gather_error( + 'Statistik-Nord-Harvester: Error gathering the identifiers from the source server [%s]' % str(e), + harvest_job) + log.error(e) + return None + + if len(ids) > 0: + #log.info( + # "finished %s IDs of %s IDs successfully gathered" % (len(used_identifiers), len(documents))) + #log.debug("List of gathered IDs: %s" % ids) + #log.debug("gather_stage() finished: %s IDs gathered" % len(ids)) + return ids + else: + #log.error("No records received") + self._save_gather_error("Couldn't find any metadata files", harvest_job) + return None + + @staticmethod + def fetch_stage(harvest_object): + return True + + def import_stage(self, harvest_object): + context = { + 'model': model, + 'session': model.Session, + 'user': self._get_user_name(), + } + #log.debug("user: " + self._get_user_name()) + if not harvest_object: + log.error('Statistik-Nord-Harvester: No harvest object received') + return False + + if harvest_object.content is None: + self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import') + return False + else: + self.use_mapper(context, harvest_object) + #self.original_method(context, harvest_object) + + return True + + def use_mapper(self, context, harvest_object): + values = json.loads(harvest_object.content) + package = dict() + + package = pyjq_mapper(values) + source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id}) + package['owner_org'] = source_dataset.get('owner_org') + package['id'] = str(uuid.uuid4()) + + package_dict = dict(package) + log.debug("license_id: " + str(package_dict["license_id"])) + log.debug("extras: " + str(package_dict["extras"])) + if package_dict.has_key("maintainer"): + log.debug("maintainer: " + str(package_dict["maintainer"])) + log.debug("url: " + str(package_dict["url"])) + log.debug("resources/file_size: " + str(package_dict["resources"])) + log.debug("tags: " + str(package_dict["tags"])) + + try: + result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show') + return result + except toolkit.ValidationError, e: + self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import') + return False + + def original_method(self, context, harvest_object): + values = json.loads(harvest_object.content) + #log.debug(values) + package_dict = dict() + package_dict.update({'resources': [], 'tags': [], 'groups': []}) + package_dict.update({'title': values['Titel']}) + package_dict.update({'notes': values['Beschreibung']}) + package_dict.update({'license_id': values['Nutzungsbestimmungen']['ID_derLizenz'][0]}) + package_dict.update({'author': values["VeroeffentlichendeStelle"]["Name"]}) + package_dict.update({'author_email': values["VeroeffentlichendeStelle"]["EMailAdresse"]}) + extras = list() + extras.append({'key': 'identifier', 'value': self._create_inforeg_id(values)}) + package_dict['extras'] = extras + if values['Ansprechpartner']: + package_dict.update({'maintainer': values['Ansprechpartner']['Name'], + 'maintainer_email': 
values['Ansprechpartner']['EMailAdresse']}) + try: + package_dict['url'] = values['WeitereInformationen']['URL'] + except KeyError: + package_dict['url'] = "" + package_dict.update({'type': 'dataset'}) + resources = values['Ressourcen']['Ressource'] + for resource in resources: + resource_dict = dict() + resource_dict['name'] = resource['Ressourcenname'] + resource_dict['format'] = resource['Format'].get('FormatTyp', "") + resource_dict['url'] = resource['URLZumDownload'] + if resource['Dateigroesse'] == "0" or len(resource['Dateigroesse']) == 0: + resource_file = urllib2.urlopen(resource['url']) + resource_dict['file_size'] = resource_file['Content-Length'] + else: + file_size = int(round(float(resource['Dateigroesse']) * 1000000)) + resource_dict['file_size'] = file_size + package_dict['resources'].append(resource_dict) + tags = values['Schlagwoerter']['Schlagwort'] + # + for tag in tags: + seperated_tags = tag.split(',') + for seperated_tag in seperated_tags: + if seperated_tag != '' and len(seperated_tag) < 100: + package_dict['tags'].append({'name': seperated_tag.strip()}) + + self.map_to_group(package_dict, values) + ## How To?: + source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id}) + package_dict['owner_org'] = source_dataset.get('owner_org') + package_dict['id'] = str(uuid.uuid4()) + log.debug("license_id: " + str(package_dict["license_id"])) + log.debug("extras: " + str(package_dict["extras"])) + if package_dict.has_key("maintainer"): + log.debug("maintainer: " + str(package_dict["maintainer"])) + log.debug("url: " + str(package_dict["url"])) + log.debug("resource/file_size: " + str(package_dict["resources"])) + log.debug("tags: " + str(package_dict["tags"])) + + try: + result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show') + return result + except toolkit.ValidationError, e: + self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import') + return False + + def map_to_group(self, package_dict, values): + # open file with the mapping from numbers to DCAT-DE vocabulary: + with open('/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/harvesters/number_dcat_de.json') as f: + dcat_theme = json.load(f) + # get the code + code = values['StANKategorie'] + # if possible map it to a group + if dcat_theme.has_key(str(code)): + for item in dcat_theme[str(code)]: + package_dict['groups'].append({'name': item}) + #log.debug("DEBUG: DCAT-DE Code Mapping from %s to %s", str(code), item) + + @staticmethod + def _get_content(url): + url = url.replace(' ', '%20') + #log.debug("get_content StatistikNord harvester: %s" % url) + try: + http_response = urllib2.urlopen(url, timeout=100000) + content = http_response.read() + return content + except Exception, e: + log.error('traceback WebHarvester could not get content!: %s' % traceback.format_exc()) + log.debug("Error in _get_content %s" % e) + raise e + + @staticmethod + def get_documents_from_content(content): + fetched_xml = etree.fromstring(content) + fetched_string = etree.tostring(fetched_xml) + + fetched_document = StatistikNordDocuments(fetched_string) + + fetched_values = fetched_document.read_values() + return fetched_values + + @staticmethod + def get_values_from_content(content): + fetched_xml = etree.fromstring(content) + fetched_string = etree.tostring(fetched_xml) + fetched_document = StatistikNordDocument(fetched_string) + fetched_values = fetched_document.read_values() + + return fetched_values + + @staticmethod + 
def _create_inforeg_id(values): + guid = values['DokumentID'] + quelle = values['Quelle'] + if guid.startswith(quelle): + return guid.strip() + else: + return quelle + ':' + guid.strip() + + def add_groups_to_fetched_values(self, fetched_values): + groups = [] + if 'StANProdukte' in fetched_values and '4' in fetched_values['StANProdukte']: + #log.debug("Get Groups from database") + groups = self.get_all_groups() + #else: + # if 'StANThemen' in fetched_values: + # groups = self.translate_group(fetched_values['StANThemen']) + + fetched_values['Kategorie'] = {} + fetched_values['Kategorie']['NameDerKategorie'] = [] + if groups: + fetched_values['Kategorie']['NameDerKategorie'] = groups + + #log.debug(fetched_values['Kategorie']['NameDerKategorie']) + return fetched_values + + @staticmethod + def get_all_groups(): + result_groups = [] + groups_in_database = model.Session.query(model.Group.name).filter(model.Group.state == 'active') + for group_in_database in groups_in_database.all(): + result_groups.append(group_in_database.name) + + return result_groups + + +class ContentFetchError(Exception): + pass + + +class ContentNotFoundError(ContentFetchError): + pass + + +class RemoteResourceError(Exception): + pass + + +class SearchError(Exception): + pass diff --git a/ckanext/odsh/tests/result_data.json b/ckanext/odsh/tests/result_data.json new file mode 100644 index 0000000000000000000000000000000000000000..0619319c0420b9b4390a30517ba5ea3eba980776 --- /dev/null +++ b/ckanext/odsh/tests/result_data.json @@ -0,0 +1,61 @@ +{ + "title": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002", + "url": "http://www.statistik-nord.de", + "notes": "", + "author": "Statistisches Amt für Hamburg und Schleswig-Holstein", + "author_email": "info@statistik-nord.de", + "extras": [ + { + "key": "identifier", + "value": "StaNord_CMS:50625" + }, + { + "key": "issued", + "value": null + }, + { + "key": "modified", + "value": null + }, + { + "key" : "temporal_start", + "value" : "" + }, + { + "key" : "temporal_end", + "value" : "" + }, + { + "key" : "theme", + "value" :[] + } + + ], + "groups": [ + { + "name": "tran" + }, + { + "name": "envi" + }, + { + "name": "ener" + } + ], + "license_id":"dl-by-de/2.0", + "tags": [ + { + "name": "Umwelt " + } + ], + "type": "dataset", + "resources": [ + { + "url": "https://www.statistik-nord.de/fileadmin/Dokumente/Statistische_Berichte/verkehr_umwelt_und_energie/Q_I_3_j_H/Q_I_3_j02_H.pdf", + "format": "pdf", + "name": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002", + "size": 46903, + "license" : "dl-by-de/2.0" + } + ] +} \ No newline at end of file diff --git a/ckanext/odsh/tests/test_ckan_mapper.py b/ckanext/odsh/tests/test_ckan_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..847a8e9bcde81de0775bbefa15f546a6c1a14410 --- /dev/null +++ b/ckanext/odsh/tests/test_ckan_mapper.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +import json +import unittest + +from ckanext.odsh.harvesters import ckan_mapper as mapper + + +def value_error(): + raise ValueError('Config string can not be empty.') + +class TestMappingMethodsStatistikamtNord(unittest.TestCase): + + # Files to load + with open('/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/tests/test_data.json') as f: + input_data = json.load(f) + with open('/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/tests/result_data.json') as f: + expected_result = json.load(f) + + # 
Loads the Statistikamt Nord-specific configuration + config_filter = mapper.sta_amt_nord.config_filter + numbers = mapper.sta_amt_nord.numbers + + + # Test pyjq mapper with an empty config string + def test_pyjq_mapper_with_invalid_config_string(self): + + # Save the data for generating new input for the test + test_dict = self.input_data + + # Arrange + self.config_filter = "" + # Act + + # Assert + with self.assertRaises(ValueError): + mapper.pyjq_mapper(self.config_filter, test_dict, self.numbers) + + + + # Test pyjq mapper with invalid StANKategorie + def test_pyjq_mapper_with_invalid_StANKategorie(self): + + # Save the data for generating new input for the test + test_dict = self.input_data + + # Arrange + # Set category to a value that cannot be mapped + test_dict["StANKategorie"] = unicode(0) + # Set the expected result data correctly + self.expected_result["groups"] = [] + + # Act + package = mapper.pyjq_mapper(self.config_filter, test_dict, self.numbers) + + # Assert + self.assertEqual(package, self.expected_result) + + +if __name__ == '__main__': + unittest.main() diff --git a/ckanext/odsh/tests/test_data.json b/ckanext/odsh/tests/test_data.json new file mode 100644 index 0000000000000000000000000000000000000000..c5ad278f4ac9d1f52ae8cffe061da48c19c99267 --- /dev/null +++ b/ckanext/odsh/tests/test_data.json @@ -0,0 +1,39 @@ +{"TypDesInhalts": "dokument", + "Quelle": "StaNord_CMS", + "AbdeckungAlsTextAusOGDD": "", + "AlternativtextFuerDasBild": "", + "StANKategorie": "83", + "Beschreibung": "", + "Beziehungsverknuepfungen": "", + "OfflineDatum": "", + "ArtDerFormAusOGDD": "", + "Nummern": {"Nummer": ["Q I 3 - j/02 H"]}, + "DokumentID": "50625", + "Ansprechpartner": "", + "StANProdukte": ["106"], + "E-Aktenzeichen": "", + "ZeitlicheGranularitaetEinheit": "", + "VeroeffentlichendeStelle": {"Name": "Statistisches Amt für Hamburg und Schleswig-Holstein", + "EMailAdresse": "info@statistik-nord.de"}, + "Bild": "", + "WeitereInformationen": {"URL": "http://www.statistik-nord.de"}, + "AbdeckungInKoordinatenAusOGDD": "", + "Schlagwoerter": {"Schlagwort": ["Umwelt ", + "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg"]}, + "ZeitraumBis": "", + "Volltext": "", + "Namensnennung": "Freie und Hansestadt Hamburg, Statistisches Amt für Hamburg und Schleswig-Holstein", + "Titel": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002", + "Infogegenstand": {"Bezeichnung": ["statistiken_IG7"]}, + "StANThemen": ["83"], + "Papieraktenzeichen": "", + "ZeitraumVon": "", + "MetadatenModellVersionsnummer": "", + "Ressourcen": {"Ressource": + [{"Dateigroesse": "0.046903610229492", + "DateinameInDerLieferung": "Q_I_3_j02_H.pdf", + "URLZumDownload": "https://www.statistik-nord.de/fileadmin/Dokumente/Statistische_Berichte/verkehr_umwelt_und_energie/Q_I_3_j_H/Q_I_3_j02_H.pdf", + "Ressourcenname": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002", + "Format": {"FormatTyp": "pdf"}}]}, + "Nutzungsbestimmungen": {"ID_derLizenz": ["dl-de-by-2.0"]}, + "ZeitlicheGranularitaetIntervallDerErhebung": ""} diff --git a/requirements.txt b/requirements.txt index 6042e0e84bc674a8af3af2141a129f626d75a1c8..ae27d47ed78980f55e486895c69f474f858f7508 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ ckan ckanext-harvest ckanext-spatial -lxml \ No newline at end of file +lxml +pyjq \ No newline at end of file
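
For reference, a minimal usage sketch (not part of the patch) that runs the bundled example record through the new pyjq-based mapping. It assumes pyjq is installed and that the script is started from the repository root so the relative path to statistik-nord-example_2.json resolves; the expected output shape is what ckanext/odsh/tests/result_data.json documents.

#!/usr/bin/env python
# Illustrative sketch only; the relative path below is an assumption about the checkout layout.
import json

from ckanext.odsh.harvesters import ckan_mapper as mapper

with open('ckanext/odsh/harvesters/statistik-nord-example_2.json') as f:
    record = json.load(f)

# pyjq_mapper returns a dict shaped like a CKAN package (compare result_data.json)
package = mapper.pyjq_mapper(mapper.sta_amt_nord.config_filter, record, mapper.sta_amt_nord.numbers)

print package['license_id']   # "dl-by-de/2.0"
print package['groups']       # StANKategorie "83" maps to the groups tran, envi and ener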