Skip to content
Snippets Groups Projects
Commit e626c166 authored by anonymous's avatar anonymous
Browse files

Merge branch 'master' of race.informatik.uni-hamburg.de:odsh/ckanext-odsh

parents 73a851de 2ba2b431
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
"""
Mapper, that loads all necessary mapping files, calls the pyjq method and returns valid ckan-formatted values
"""
import json
import pyjq as pyjq
# import the "stat_amt_nord" configuration file to determine
# 1st the mapping from statistikamt nord fields onto ckan-fields and
# 2nd the mapping form the category numbers onto the MDR data-theme authority codes
import config_stat_amt_nord as sta_amt_nord
def pyjq_mapper(config_filter, value, numbers):
    """Apply a delivery-system-specific jq filter to one harvested document.

    :param config_filter: jq filter string describing the field mapping
    :param value: input document to map onto the CKAN format
    :param numbers: mapping from delivery-system category numbers to MDR
        authority codes, exposed to the jq program as $numbers
    :return: dict holding valid CKAN-formatted values
    :raises ValueError: if config_filter is the empty string
    """
    if config_filter == "":
        raise ValueError('Config string can not be empty.')
    # pyjq.all returns a list of results; the mapping produces one object.
    mapped = pyjq.all(config_filter, value, vars={"numbers": numbers})
    return dict(mapped[0])
# Minimal manual smoke test for the pyjq-based mapping (kept for reference;
# exercise it by uncommenting the lines below).
test_config_filter = sta_amt_nord.config_filter
test_numbers = sta_amt_nord.numbers
#with open('/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/harvesters/statistik-nord-example_2.json') as f:
# input = json.load(f)
# print "Input: " + str(input)
# result = dict(pyjq_mapper(test_config_filter, input, test_numbers))
#print "Result: " + str(result)
#!/usr/bin/env python
"""
Some Variables needed for configuration of the ckan_mapper function.
It supplies a filter string needed by the pyjq-mapper and a dictionary containing the Statistikamt Nord
category numbers and the corresponding MDR data-theme authority codes.
The config_filter string is built up like a JSON string.
To generate this string, visit https://stedolan.github.io/jq/manual/.
The keys are valid CKAN-Fields, and the values are the paths to
the corresponding values within the Statistikamt Nord datasets.
"""
# jq filter that maps a Statistikamt Nord document onto CKAN package fields.
# Keys are CKAN fields; values are jq paths into the harvested document.
# $numbers is the category-number -> MDR authority-code mapping defined below.
config_filter = '{"author" : .VeroeffentlichendeStelle.Name,' \
'"url" : .WeitereInformationen.URL,' \
'"notes" : .Beschreibung,' \
'"title" : .Titel,' \
'"author_email": .VeroeffentlichendeStelle.EMailAdresse,' \
'"extras": [' \
' {"key": "identifier", "value": (.Quelle + ":" +.DokumentID)},' \
' {"key": "issued", "value": .Veroeffentlichungsdatum?},' \
' {"key": "modified", "value": .Aenderungsdatum?},' \
' {"key": "temporal_start", "value": .ZeitraumVon?},' \
' {"key": "temporal_end", "value": .ZeitraumBis?},' \
' ([$numbers[.StANKategorie][]?] | {"key":"theme", "value": .})],' \
'"resources" : [.Ressourcen.Ressource[] | {' \
' "url":.URLZumDownload,' \
' "size" : (.Dateigroesse | tonumber | . * 1000000 | floor),' \
' "name" : .Ressourcenname,' \
' "format": .Format.FormatTyp,' \
' "license": "dl-by-de/2.0"}],' \
'"license_id" : "dl-by-de/2.0",' \
'"type" : "dataset",' \
'"tags" : [.Schlagwoerter |.Schlagwort[] | gsub(", "; ",") | select(length > 0) |' \
' split(",") |.[] |.+ "" | if (. | length) <= 100 then {"name" : .} else empty end],' \
'"groups" : [$numbers[.StANKategorie][]? | {"name" : .}]}'
# This mapping should be exchanged by the official mapping list from statistikamt nord
numbers = {
"59" : ["econ", " agri"],
"56" : ["econ"],
"74" : ["econ"],
"48" : ["regi"],
"50" : ["regi"],
"68" : ["econ"],
"54" : ["econ"],
"116" : ["soci"],
"7" : ["soci"],
"9" : ["soci"],
"8" : ["soci"],
"93" : ["gove"],
"36" : ["educ"],
"75" : ["econ"],
"58" : ["tech"],
"90" : ["gove"],
"91" : ["gove"],
"73" : ["econ"],
"55" : ["econ"],
"31" : ["soci"],
"87" : ["ener", " envi"],
"60" : ["econ"],
"53" : ["econ"],
"52" : ["econ"],
"92" : ["gove"],
"10" : ["soci"],
"79" : ["gove", " econ"],
"72" : ["intr"],
"117" : ["regi"],
"49" : ["regi"],
"35" : ["regi"],
"43" : ["heal"],
"42" : ["heal", "soci"],
"64" : ["econ"],
"70" : ["intr"],
"69" : ["econ"],
"27" : ["soci"],
"28" : ["soci", "econ"],
"38" : ["educ"],
"108" : ["econ"],
"67" : ["econ"],
"66" : ["econ"],
"65" : ["econ"],
"46" : ["soci"],
"39" : ["intr"],
"57" : ["agri"],
"115" : ["agri"],
"80" : ["gove"],
"82" : ["gove"],
"47" : ["soci"],
"78" : ["soci"],
"40" : ["just"],
"85" : ["tran", "econ"],
"37" : ["educ"],
"45" : ["soci"],
"81" : ["gove"],
"84" : ["tran", "econ"],
"44" : ["heal"],
"71" : ["educ"],
"88" : ["envi"],
"86" : ["envi"],
"62" : ["econ"],
"63" : ["econ"],
"83" : ["tran", "envi", "ener"],
"77" : ["econ"],
"61" : ["econ"],
"98" : ["gove"],
"76" : ["econ"],
"89" : ["gove"],
"41" : ["educ"],
"51" : ["regi"],
"111" : ["soci", "regi"]
}
{"TypDesInhalts": "dokument",
"Quelle": "StaNord_CMS",
"AbdeckungAlsTextAusOGDD": "",
"AlternativtextFuerDasBild": "",
"StANKategorie": "83",
"Beschreibung": "",
"Beziehungsverknuepfungen": "",
"OfflineDatum": "",
"ArtDerFormAusOGDD": "",
"Nummern": {"Nummer": ["Q I 3 - j/02 H"]},
"DokumentID": "50625",
"Ansprechpartner": "",
"StANProdukte": ["106"],
"E-Aktenzeichen": "",
"ZeitlicheGranularitaetEinheit": "",
"VeroeffentlichendeStelle": {"Name": "Statistisches Amt für Hamburg und Schleswig-Holstein",
"EMailAdresse": "info@statistik-nord.de"},
"Bild": "",
"WeitereInformationen": {"URL": "http://www.statistik-nord.de"},
"AbdeckungInKoordinatenAusOGDD": "",
"Schlagwoerter": {"Schlagwort": ["Umwelt ", "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg"]},
"ZeitraumBis": "15.12.2002",
"Volltext": "",
"Namensnennung": "Freie und Hansestadt Hamburg, Statistisches Amt für Hamburg und Schleswig-Holstein",
"Titel": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002",
"Infogegenstand": {"Bezeichnung": ["statistiken_IG7"]},
"StANThemen": ["83"],
"Papieraktenzeichen": "",
"ZeitraumVon": "01.01.2002",
"MetadatenModellVersionsnummer": "",
"Ressourcen": {"Ressource":
[{"Dateigroesse": "0.046903610229492",
"DateinameInDerLieferung": "Q_I_3_j02_H.pdf",
"URLZumDownload": "https://www.statistik-nord.de/fileadmin/Dokumente/Statistische_Berichte/verkehr_umwelt_und_energie/Q_I_3_j_H/Q_I_3_j02_H.pdf",
"Ressourcenname": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002",
"Format": {"FormatTyp": "pdf"}}]},
"Nutzungsbestimmungen": {"ID_derLizenz": ["dl-de-by-2.0"]},
"ZeitlicheGranularitaetIntervallDerErhebung": ""}
\ No newline at end of file
import urllib
import urllib2
import httplib
import datetime
import socket
import traceback
from lxml import etree
import uuid
import logging
from ckan import model, logic
from ckan.logic import ValidationError, NotFound, get_action
from ckan import model
from ckan.logic import get_action
from ckan.lib.helpers import json
from ckan.lib.munge import munge_name
from ckan.lib.navl.validators import not_empty
from ckan.plugins import toolkit
from ckanext.harvest.model import HarvestObject, HarvestJob
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.harvesters.base import HarvesterBase
from ckanext.odsh.model.statistiknord import *
import logging
# Config filter defines mapping for statistikamt nord - to ckan fields as a string
from config_stat_amt_nord import config_filter
# Number mapper for mapping form statistikamt nord number to MDR data-theme authority codes
from config_stat_amt_nord import numbers
from ckanext.odsh.harvesters.ckan_mapper import pyjq_mapper
log = logging.getLogger(__name__)
class StatistikNordHarvester(HarvesterBase):
'''
"""
A Harvester for Statistikamt Nord
'''
"""
def info(self):
return {
......@@ -43,7 +47,7 @@ class StatistikNordHarvester(HarvesterBase):
documents_list = self.get_documents_from_content(fetched_documents)
documents = documents_list['RegistereintragsListe']
except Exception, e:
log.error('traceback while reading model: %s' % traceback.format_exc())
# log.error('traceback while reading model: %s' % traceback.format_exc())
self._save_gather_error('Statistik-Nord-Harvester: Error while reading model [%r]' % e, harvest_job)
return False
......@@ -54,7 +58,7 @@ class StatistikNordHarvester(HarvesterBase):
try:
fetched_values = self.get_values_from_content(document)
identifier = self._create_inforeg_id(fetched_values)
log.info('identifier: %s' % identifier)
# log.info('identifier: %s' % identifier)
if identifier in used_identifiers:
continue
......@@ -90,8 +94,8 @@ class StatistikNordHarvester(HarvesterBase):
if len(ids) > 0:
log.info(
"finished %s IDs of %s IDs successfully gathered" % (len(used_identifiers), len(documents)))
log.debug("List of gathered IDs: %s" % ids)
log.debug("gather_stage() finished: %s IDs gathered" % len(ids))
# log.debug("List of gathered IDs: %s" % ids)
# log.debug("gather_stage() finished: %s IDs gathered" % len(ids))
return ids
else:
log.error("No records received")
......@@ -108,7 +112,8 @@ class StatistikNordHarvester(HarvesterBase):
'session': model.Session,
'user': self._get_user_name(),
}
log.debug("user: " + self._get_user_name())
# log.debug("user: " + self._get_user_name())
if not harvest_object:
log.error('Statistik-Nord-Harvester: No harvest object received')
return False
......@@ -117,57 +122,21 @@ class StatistikNordHarvester(HarvesterBase):
self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import')
return False
else:
self.dcat_mapper(context, harvest_object)
return True
# A mapper method that maps the content of the harvested object onto the CKAN dataset fields
def dcat_mapper(self, context, harvest_object):
values = json.loads(harvest_object.content)
package_dict = dict()
package_dict.update({'resources': [], 'tags': []})
package_dict.update({'title': values['Titel']})
package_dict.update({'notes': values['Beschreibung']})
package_dict.update({'license_id': values['Nutzungsbestimmungen']['ID_derLizenz'][0]})
package_dict.update({'author': values["VeroeffentlichendeStelle"]["Name"]})
package_dict.update({'author_email': values["VeroeffentlichendeStelle"]["EMailAdresse"]})
extras = list()
extras.append({'key': 'identifier', 'value': self._create_inforeg_id(values)})
package_dict['extras'] = extras
if values['Ansprechpartner']:
package_dict.update({'maintainer': values['Ansprechpartner']['Name'],
'maintainer_email': values['Ansprechpartner']['EMailAdresse']})
try:
package_dict['url'] = values['WeitereInformationen']['URL']
except KeyError:
package_dict['url'] = ""
package_dict.update({'type': 'dataset'})
resources = values['Ressourcen']['Ressource']
for resource in resources:
resource_dict = dict()
resource_dict['name'] = resource['Ressourcenname']
resource_dict['format'] = resource['Format'].get('FormatTyp', "")
resource_dict['url'] = resource['URLZumDownload']
if resource['Dateigroesse'] == "0" or len(resource['Dateigroesse']) == 0:
resource_file = urllib2.urlopen(resource['url'])
resource_dict['file_size'] = resource_file['Content-Length']
else:
file_size = int(round(float(resource['Dateigroesse']) * 1000000))
resource_dict['file_size'] = file_size
package_dict['resources'].append(resource_dict)
tags = values['Schlagwoerter']['Schlagwort']
for tag in tags:
seperated_tags = tag.split(',')
for seperated_tag in seperated_tags:
if seperated_tag != '' and len(seperated_tag) < 100:
package_dict['tags'].append({'name': seperated_tag.strip()})
# Use the pyjq lib for the default field mapping
package = pyjq_mapper(config_filter, values, numbers)
# Add some meta data that is not part of the harvested_object
source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id})
package_dict['owner_org'] = source_dataset.get('owner_org')
package_dict['id'] = str(uuid.uuid4())
log.debug(json.dumps(package_dict))
package['owner_org'] = source_dataset.get('owner_org')
package['id'] = str(uuid.uuid4())
package_dict = dict(package)
try:
result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show')
return result
......@@ -178,7 +147,7 @@ class StatistikNordHarvester(HarvesterBase):
@staticmethod
def _get_content(url):
url = url.replace(' ', '%20')
log.debug("get_content StatistikNord harvester: %s" % url)
# log.debug("get_content StatistikNord harvester: %s" % url)
try:
http_response = urllib2.urlopen(url, timeout=100000)
content = http_response.read()
......@@ -192,9 +161,7 @@ class StatistikNordHarvester(HarvesterBase):
def get_documents_from_content(content):
fetched_xml = etree.fromstring(content)
fetched_string = etree.tostring(fetched_xml)
fetched_document = StatistikNordDocuments(fetched_string)
fetched_values = fetched_document.read_values()
return fetched_values
......@@ -216,23 +183,6 @@ class StatistikNordHarvester(HarvesterBase):
else:
return quelle + ':' + guid.strip()
def add_groups_to_fetched_values(self, fetched_values):
groups = []
if 'StANProdukte' in fetched_values and '4' in fetched_values['StANProdukte']:
log.debug("Get Groups from database")
groups = self.get_all_groups()
#else:
# if 'StANThemen' in fetched_values:
# groups = self.translate_group(fetched_values['StANThemen'])
fetched_values['Kategorie'] = {}
fetched_values['Kategorie']['NameDerKategorie'] = []
if groups:
fetched_values['Kategorie']['NameDerKategorie'] = groups
log.debug(fetched_values['Kategorie']['NameDerKategorie'])
return fetched_values
@staticmethod
def get_all_groups():
result_groups = []
......
import logging
import traceback
import urllib2
import uuid

from lxml import etree

from ckan import model
from ckan.lib.helpers import json
from ckan.logic import get_action
from ckan.plugins import toolkit
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.harvesters.base import HarvesterBase
from ckanext.odsh.harvesters.ckan_mapper import pyjq_mapper
from ckanext.odsh.model.statistiknord import *
# Field-mapping configuration: jq filter string and the category-number ->
# MDR authority-code mapping, both consumed by pyjq_mapper.
from config_stat_amt_nord import config_filter, numbers

log = logging.getLogger(__name__)
class StatistikNordHarvester(HarvesterBase):
'''
A Harvester for Statistikamt Nord
'''
def info(self):
return {
'name': 'statistik-nord',
'title': 'Statistik Nord',
'description': 'Harvests Statistikamt Nord',
'form_config_interface': 'Text'
}
def gather_stage(self, harvest_job):
url = harvest_job.source.url
try:
fetched_documents = self._get_content(url)
documents_list = self.get_documents_from_content(fetched_documents)
documents = documents_list['RegistereintragsListe']
except Exception, e:
#log.error('traceback while reading model: %s' % traceback.format_exc())
self._save_gather_error('Statistik-Nord-Harvester: Error while reading model [%r]' % e, harvest_job)
return False
try:
used_identifiers = []
ids = []
for document in documents:
try:
fetched_values = self.get_values_from_content(document)
identifier = self._create_inforeg_id(fetched_values)
#log.info('identifier: %s' % identifier)
if identifier in used_identifiers:
continue
if identifier is None:
log.error("ID: unknown - gather process failed ")
continue
if identifier:
obj = HarvestObject(guid=identifier,
job=harvest_job)
obj.content = json.dumps(fetched_values)
obj.save()
log.info(
"harvest_object_id: %s, GUID: %s successfully gathered " % (str(obj.id), str(obj.guid)))
used_identifiers.append(identifier)
ids.append(obj.id)
log.debug('Save identifier %s from Statistik Nord' % identifier)
except Exception, e:
log.error('traceback: %s' % traceback.format_exc())
self._save_gather_error(
'Statistik-Nord-Harvester: Error for the identifier %s [%r]' % (identifier, e), harvest_job)
continue
except Exception, e:
self._save_gather_error(
'Statistik-Nord-Harvester: Error gathering the identifiers from the source server [%s]' % str(e),
harvest_job)
log.error(e)
return None
if len(ids) > 0:
#log.info(
# "finished %s IDs of %s IDs successfully gathered" % (len(used_identifiers), len(documents)))
#log.debug("List of gathered IDs: %s" % ids)
#log.debug("gather_stage() finished: %s IDs gathered" % len(ids))
return ids
else:
#log.error("No records received")
self._save_gather_error("Couldn't find any metadata files", harvest_job)
return None
@staticmethod
def fetch_stage(harvest_object):
return True
def import_stage(self, harvest_object):
context = {
'model': model,
'session': model.Session,
'user': self._get_user_name(),
}
#log.debug("user: " + self._get_user_name())
if not harvest_object:
log.error('Statistik-Nord-Harvester: No harvest object received')
return False
if harvest_object.content is None:
self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import')
return False
else:
self.use_mapper(context, harvest_object)
#self.original_method(context, harvest_object)
return True
def use_mapper(self, context, harvest_object):
values = json.loads(harvest_object.content)
package = dict()
package = pyjq_mapper(values)
source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id})
package['owner_org'] = source_dataset.get('owner_org')
package['id'] = str(uuid.uuid4())
package_dict = dict(package)
log.debug("license_id: " + str(package_dict["license_id"]))
log.debug("extras: " + str(package_dict["extras"]))
if package_dict.has_key("maintainer"):
log.debug("maintainer: " + str(package_dict["maintainer"]))
log.debug("url: " + str(package_dict["url"]))
log.debug("resources/file_size: " + str(package_dict["resources"]))
log.debug("tags: " + str(package_dict["tags"]))
try:
result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show')
return result
except toolkit.ValidationError, e:
self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import')
return False
def original_method(self, context, harvest_object):
values = json.loads(harvest_object.content)
#log.debug(values)
package_dict = dict()
package_dict.update({'resources': [], 'tags': [], 'groups': []})
package_dict.update({'title': values['Titel']})
package_dict.update({'notes': values['Beschreibung']})
package_dict.update({'license_id': values['Nutzungsbestimmungen']['ID_derLizenz'][0]})
package_dict.update({'author': values["VeroeffentlichendeStelle"]["Name"]})
package_dict.update({'author_email': values["VeroeffentlichendeStelle"]["EMailAdresse"]})
extras = list()
extras.append({'key': 'identifier', 'value': self._create_inforeg_id(values)})
package_dict['extras'] = extras
if values['Ansprechpartner']:
package_dict.update({'maintainer': values['Ansprechpartner']['Name'],
'maintainer_email': values['Ansprechpartner']['EMailAdresse']})
try:
package_dict['url'] = values['WeitereInformationen']['URL']
except KeyError:
package_dict['url'] = ""
package_dict.update({'type': 'dataset'})
resources = values['Ressourcen']['Ressource']
for resource in resources:
resource_dict = dict()
resource_dict['name'] = resource['Ressourcenname']
resource_dict['format'] = resource['Format'].get('FormatTyp', "")
resource_dict['url'] = resource['URLZumDownload']
if resource['Dateigroesse'] == "0" or len(resource['Dateigroesse']) == 0:
resource_file = urllib2.urlopen(resource['url'])
resource_dict['file_size'] = resource_file['Content-Length']
else:
file_size = int(round(float(resource['Dateigroesse']) * 1000000))
resource_dict['file_size'] = file_size
package_dict['resources'].append(resource_dict)
tags = values['Schlagwoerter']['Schlagwort']
#
for tag in tags:
seperated_tags = tag.split(',')
for seperated_tag in seperated_tags:
if seperated_tag != '' and len(seperated_tag) < 100:
package_dict['tags'].append({'name': seperated_tag.strip()})
self.map_to_group(package_dict, values)
## How To?:
source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id})
package_dict['owner_org'] = source_dataset.get('owner_org')
package_dict['id'] = str(uuid.uuid4())
log.debug("license_id: " + str(package_dict["license_id"]))
log.debug("extras: " + str(package_dict["extras"]))
if package_dict.has_key("maintainer"):
log.debug("maintainer: " + str(package_dict["maintainer"]))
log.debug("url: " + str(package_dict["url"]))
log.debug("resource/file_size: " + str(package_dict["resources"]))
log.debug("tags: " + str(package_dict["tags"]))
try:
result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show')
return result
except toolkit.ValidationError, e:
self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import')
return False
def map_to_group(self, package_dict, values):
# open file with the mapping from numbers to DCAT-DE vocabulary:
with open('/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/harvesters/number_dcat_de.json') as f:
dcat_theme = json.load(f)
# get the code
code = values['StANKategorie']
# if possible map it to a group
if dcat_theme.has_key(str(code)):
for item in dcat_theme[str(code)]:
package_dict['groups'].append({'name': item})
#log.debug("DEBUG: DCAT-DE Code Mapping from %s to %s", str(code), item)
@staticmethod
def _get_content(url):
url = url.replace(' ', '%20')
#log.debug("get_content StatistikNord harvester: %s" % url)
try:
http_response = urllib2.urlopen(url, timeout=100000)
content = http_response.read()
return content
except Exception, e:
log.error('traceback WebHarvester could not get content!: %s' % traceback.format_exc())
log.debug("Error in _get_content %s" % e)
raise e
@staticmethod
def get_documents_from_content(content):
fetched_xml = etree.fromstring(content)
fetched_string = etree.tostring(fetched_xml)
fetched_document = StatistikNordDocuments(fetched_string)
fetched_values = fetched_document.read_values()
return fetched_values
@staticmethod
def get_values_from_content(content):
fetched_xml = etree.fromstring(content)
fetched_string = etree.tostring(fetched_xml)
fetched_document = StatistikNordDocument(fetched_string)
fetched_values = fetched_document.read_values()
return fetched_values
@staticmethod
def _create_inforeg_id(values):
guid = values['DokumentID']
quelle = values['Quelle']
if guid.startswith(quelle):
return guid.strip()
else:
return quelle + ':' + guid.strip()
def add_groups_to_fetched_values(self, fetched_values):
groups = []
if 'StANProdukte' in fetched_values and '4' in fetched_values['StANProdukte']:
#log.debug("Get Groups from database")
groups = self.get_all_groups()
#else:
# if 'StANThemen' in fetched_values:
# groups = self.translate_group(fetched_values['StANThemen'])
fetched_values['Kategorie'] = {}
fetched_values['Kategorie']['NameDerKategorie'] = []
if groups:
fetched_values['Kategorie']['NameDerKategorie'] = groups
#log.debug(fetched_values['Kategorie']['NameDerKategorie'])
return fetched_values
@staticmethod
def get_all_groups():
result_groups = []
groups_in_database = model.Session.query(model.Group.name).filter(model.Group.state == 'active')
for group_in_database in groups_in_database.all():
result_groups.append(group_in_database.name)
return result_groups
class ContentFetchError(Exception):
    """Raised when remote content cannot be fetched."""
    pass


class ContentNotFoundError(ContentFetchError):
    """Raised when the requested remote content does not exist."""
    pass


class RemoteResourceError(Exception):
    """Raised when a remote resource is unusable."""
    pass


class SearchError(Exception):
    """Raised when a remote search request fails."""
    pass
{
"title": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002",
"url": "http://www.statistik-nord.de",
"notes": "",
"author": "Statistisches Amt für Hamburg und Schleswig-Holstein",
"author_email": "info@statistik-nord.de",
"extras": [
{
"key": "identifier",
"value": "StaNord_CMS:50625"
},
{
"key": "issued",
"value": null
},
{
"key": "modified",
"value": null
},
{
"key" : "temporal_start",
"value" : ""
},
{
"key" : "temporal_end",
"value" : ""
},
{
"key" : "theme",
"value" :[]
}
],
"groups": [
{
"name": "tran"
},
{
"name": "envi"
},
{
"name": "ener"
}
],
"license_id":"dl-by-de/2.0",
"tags": [
{
"name": "Umwelt "
}
],
"type": "dataset",
"resources": [
{
"url": "https://www.statistik-nord.de/fileadmin/Dokumente/Statistische_Berichte/verkehr_umwelt_und_energie/Q_I_3_j_H/Q_I_3_j02_H.pdf",
"format": "pdf",
"name": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002",
"size": 46903,
"license" : "dl-by-de/2.0"
}
]
}
\ No newline at end of file
#!/usr/bin/env python
import json
import unittest
from ckanext.odsh.harvesters import ckan_mapper as mapper
def value_error():
    """Raise the ValueError that pyjq_mapper uses for an empty config string."""
    raise ValueError('Config string can not be empty.')
class TestMappingMethodsStatistikamtNord(unittest.TestCase):
    """Unit tests for the pyjq-based Statistikamt Nord -> CKAN mapping."""

    # Fixture files: input document and expected mapping result.
    # NOTE(review): hard-coded absolute paths - assumes the default CKAN
    # deployment layout; confirm before running elsewhere.
    with open('/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/tests/test_data.json') as f:
        input_data = json.load(f)
    with open('/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/tests/result_data.json') as f:
        expected_result = json.load(f)

    # Statistikamt-Nord-specific mapping configuration.
    config_filter = mapper.sta_amt_nord.config_filter
    numbers = mapper.sta_amt_nord.numbers

    def test_pyjq_mapper_with_invalid_config_string(self):
        """An empty config string must raise ValueError."""
        test_dict = self.input_data
        # Instance attribute shadows the class-level default for this test.
        self.config_filter = ""
        with self.assertRaises(ValueError):
            mapper.pyjq_mapper(self.config_filter, test_dict, self.numbers)
        # FIX: removed a stray no-op reference to value_error that followed
        # the assertRaises block.

    def test_pyjq_mapper_with_invalid_StANKategorie(self):
        """An unmappable StANKategorie must yield an empty 'groups' list."""
        # FIX: work on copies so the shared class-level fixture dicts are
        # not mutated across tests (the original wrote into input_data and
        # expected_result in place).
        test_dict = dict(self.input_data)
        # Set the category to a value that cannot be mapped.
        test_dict["StANKategorie"] = unicode(0)
        expected = dict(self.expected_result)
        expected["groups"] = []
        package = mapper.pyjq_mapper(self.config_filter, test_dict, self.numbers)
        self.assertEqual(package, expected)


if __name__ == '__main__':
    unittest.main()
{"TypDesInhalts": "dokument",
"Quelle": "StaNord_CMS",
"AbdeckungAlsTextAusOGDD": "",
"AlternativtextFuerDasBild": "",
"StANKategorie": "83",
"Beschreibung": "",
"Beziehungsverknuepfungen": "",
"OfflineDatum": "",
"ArtDerFormAusOGDD": "",
"Nummern": {"Nummer": ["Q I 3 - j/02 H"]},
"DokumentID": "50625",
"Ansprechpartner": "",
"StANProdukte": ["106"],
"E-Aktenzeichen": "",
"ZeitlicheGranularitaetEinheit": "",
"VeroeffentlichendeStelle": {"Name": "Statistisches Amt für Hamburg und Schleswig-Holstein",
"EMailAdresse": "info@statistik-nord.de"},
"Bild": "",
"WeitereInformationen": {"URL": "http://www.statistik-nord.de"},
"AbdeckungInKoordinatenAusOGDD": "",
"Schlagwoerter": {"Schlagwort": ["Umwelt ",
"Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg"]},
"ZeitraumBis": "",
"Volltext": "",
"Namensnennung": "Freie und Hansestadt Hamburg, Statistisches Amt für Hamburg und Schleswig-Holstein",
"Titel": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002",
"Infogegenstand": {"Bezeichnung": ["statistiken_IG7"]},
"StANThemen": ["83"],
"Papieraktenzeichen": "",
"ZeitraumVon": "",
"MetadatenModellVersionsnummer": "",
"Ressourcen": {"Ressource":
[{"Dateigroesse": "0.046903610229492",
"DateinameInDerLieferung": "Q_I_3_j02_H.pdf",
"URLZumDownload": "https://www.statistik-nord.de/fileadmin/Dokumente/Statistische_Berichte/verkehr_umwelt_und_energie/Q_I_3_j_H/Q_I_3_j02_H.pdf",
"Ressourcenname": "Unfälle beim Umgang mit wassergefährdenden Stoffen und bei der Beförderung wassergefährdender Stoffe in Hamburg 2002",
"Format": {"FormatTyp": "pdf"}}]},
"Nutzungsbestimmungen": {"ID_derLizenz": ["dl-de-by-2.0"]},
"ZeitlicheGranularitaetIntervallDerErhebung": ""}
......@@ -2,3 +2,4 @@ ckan
ckanext-harvest
ckanext-spatial
lxml
pyjq
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment