Skip to content
Snippets Groups Projects
Commit a4b82cdf authored by anonymous's avatar anonymous
Browse files

ODPSH-16: statistiknordharvester.py calls ckan_mapper

parent 66104383
No related branches found
No related tags found
No related merge requests found
import urllib
import urllib2 import urllib2
import httplib
import datetime
import socket
import traceback import traceback
from ckanext.odsh.harvesters.ckan_mapper import pyjq_mapper
from lxml import etree from lxml import etree
import uuid import uuid
#import json
from ckan import model, logic from ckan import model
from ckan.logic import ValidationError, NotFound, get_action from ckan.logic import get_action
from ckan.lib.helpers import json from ckan.lib.helpers import json
from ckan.lib.munge import munge_name
from ckan.lib.navl.validators import not_empty
from ckan.plugins import toolkit from ckan.plugins import toolkit
from ckanext.harvest.model import HarvestObject, HarvestJob from ckanext.harvest.model import HarvestObject
from ckanext.harvest.harvesters.base import HarvesterBase from ckanext.harvest.harvesters.base import HarvesterBase
from ckanext.odsh.model.statistiknord import * from ckanext.odsh.model.statistiknord import *
import logging import logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
...@@ -44,7 +41,7 @@ class StatistikNordHarvester(HarvesterBase): ...@@ -44,7 +41,7 @@ class StatistikNordHarvester(HarvesterBase):
documents_list = self.get_documents_from_content(fetched_documents) documents_list = self.get_documents_from_content(fetched_documents)
documents = documents_list['RegistereintragsListe'] documents = documents_list['RegistereintragsListe']
except Exception, e: except Exception, e:
log.error('traceback while reading model: %s' % traceback.format_exc()) #log.error('traceback while reading model: %s' % traceback.format_exc())
self._save_gather_error('Statistik-Nord-Harvester: Error while reading model [%r]' % e, harvest_job) self._save_gather_error('Statistik-Nord-Harvester: Error while reading model [%r]' % e, harvest_job)
return False return False
...@@ -55,7 +52,7 @@ class StatistikNordHarvester(HarvesterBase): ...@@ -55,7 +52,7 @@ class StatistikNordHarvester(HarvesterBase):
try: try:
fetched_values = self.get_values_from_content(document) fetched_values = self.get_values_from_content(document)
identifier = self._create_inforeg_id(fetched_values) identifier = self._create_inforeg_id(fetched_values)
log.info('identifier: %s' % identifier) #log.info('identifier: %s' % identifier)
if identifier in used_identifiers: if identifier in used_identifiers:
continue continue
...@@ -91,8 +88,8 @@ class StatistikNordHarvester(HarvesterBase): ...@@ -91,8 +88,8 @@ class StatistikNordHarvester(HarvesterBase):
if len(ids) > 0: if len(ids) > 0:
log.info( log.info(
"finished %s IDs of %s IDs successfully gathered" % (len(used_identifiers), len(documents))) "finished %s IDs of %s IDs successfully gathered" % (len(used_identifiers), len(documents)))
log.debug("List of gathered IDs: %s" % ids) #log.debug("List of gathered IDs: %s" % ids)
log.debug("gather_stage() finished: %s IDs gathered" % len(ids)) #log.debug("gather_stage() finished: %s IDs gathered" % len(ids))
return ids return ids
else: else:
log.error("No records received") log.error("No records received")
...@@ -109,6 +106,7 @@ class StatistikNordHarvester(HarvesterBase): ...@@ -109,6 +106,7 @@ class StatistikNordHarvester(HarvesterBase):
'session': model.Session, 'session': model.Session,
'user': self._get_user_name(), 'user': self._get_user_name(),
} }
#log.debug("user: " + self._get_user_name()) #log.debug("user: " + self._get_user_name())
if not harvest_object: if not harvest_object:
log.error('Statistik-Nord-Harvester: No harvest object received') log.error('Statistik-Nord-Harvester: No harvest object received')
...@@ -118,61 +116,21 @@ class StatistikNordHarvester(HarvesterBase): ...@@ -118,61 +116,21 @@ class StatistikNordHarvester(HarvesterBase):
self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import') self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import')
return False return False
else: else:
self.dcat_mapper(context, harvest_object)
return True
# A mapper method that maps the content of the harvested object onto the CKAN dataset fields
def dcat_mapper(self, context, harvest_object):
values = json.loads(harvest_object.content) values = json.loads(harvest_object.content)
package_dict = dict() # use the pyjq lib for the default field mapping
package_dict.update({'resources': [], 'tags': [], 'groups':[]}) package = pyjq_mapper(values)
package_dict.update({'title': values['Titel']})
package_dict.update({'notes': values['Beschreibung']})
package_dict.update({'license_id': values['Nutzungsbestimmungen']['ID_derLizenz'][0]})
package_dict.update({'author': values["VeroeffentlichendeStelle"]["Name"]})
package_dict.update({'author_email': values["VeroeffentlichendeStelle"]["EMailAdresse"]})
extras = list()
extras.append({'key': 'identifier', 'value': self._create_inforeg_id(values)})
package_dict['extras'] = extras
if values['Ansprechpartner']:
package_dict.update({'maintainer': values['Ansprechpartner']['Name'],
'maintainer_email': values['Ansprechpartner']['EMailAdresse']})
try:
package_dict['url'] = values['WeitereInformationen']['URL']
except KeyError:
package_dict['url'] = ""
package_dict.update({'type': 'dataset'})
resources = values['Ressourcen']['Ressource']
for resource in resources:
resource_dict = dict()
resource_dict['name'] = resource['Ressourcenname']
resource_dict['format'] = resource['Format'].get('FormatTyp', "")
resource_dict['url'] = resource['URLZumDownload']
if resource['Dateigroesse'] == "0" or len(resource['Dateigroesse']) == 0:
resource_file = urllib2.urlopen(resource['url'])
resource_dict['file_size'] = resource_file['Content-Length']
else:
file_size = int(round(float(resource['Dateigroesse']) * 1000000))
resource_dict['file_size'] = file_size
package_dict['resources'].append(resource_dict)
tags = values['Schlagwoerter']['Schlagwort']
for tag in tags:
seperated_tags = tag.split(',')
for seperated_tag in seperated_tags:
if seperated_tag != '' and len(seperated_tag) < 100:
package_dict['tags'].append({'name': seperated_tag.strip()})
self.map_to_group(package_dict, values)
# add some meta data that is not part of the harvested_object
source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id}) source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id})
package_dict['owner_org'] = source_dataset.get('owner_org') package['owner_org'] = source_dataset.get('owner_org')
package['id'] = str(uuid.uuid4())
package_dict['id'] = str(uuid.uuid4()) package_dict = dict(package)
#log.debug(json.dumps(package_dict))
try: try:
result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show') result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show')
return result return result
...@@ -180,25 +138,10 @@ class StatistikNordHarvester(HarvesterBase): ...@@ -180,25 +138,10 @@ class StatistikNordHarvester(HarvesterBase):
self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import') self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import')
return False return False
def map_to_group(self, package_dict, values, mapping_path=None):
    """Append CKAN groups to ``package_dict`` based on the record's category code.

    The code is read from ``values['StANKategorie']`` and looked up (as a
    string) in a JSON file that maps codes to lists of DCAT-DE theme names.
    Every theme found is appended to ``package_dict['groups']`` as
    ``{'name': theme}``. If the code is not in the mapping, the single
    placeholder group ``{'name': 'na'}`` is appended instead.

    :param package_dict: dataset dict; must already contain a ``'groups'`` list,
        which is modified in place.
    :param values: harvested record; must contain the key ``'StANKategorie'``.
    :param mapping_path: optional path to the code-to-theme JSON file. When
        omitted, the deployment's default location is used.
    :raises KeyError: if ``values`` has no ``'StANKategorie'`` entry.
    :raises IOError: if the mapping file cannot be opened.
    """
    if mapping_path is None:
        mapping_path = '/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/harvesters/number_dcat_de.json'

    # open file with the mapping from numbers to DCAT-DE vocabulary:
    with open(mapping_path) as f:
        dcat_theme = json.load(f)

    # JSON object keys are always strings, so normalise the code once.
    code = str(values['StANKategorie'])

    # `in` replaces dict.has_key(), which is deprecated and absent in Python 3.
    if code in dcat_theme:
        for item in dcat_theme[code]:
            package_dict['groups'].append({'name': item})
            log.debug("DEBUG: DCAT-DE Code Mapping from %s to %s", code, item)
    else:
        # no valid group found.
        package_dict['groups'].append({'name': "na"})
@staticmethod @staticmethod
def _get_content(url): def _get_content(url):
url = url.replace(' ', '%20') url = url.replace(' ', '%20')
log.debug("get_content StatistikNord harvester: %s" % url) #log.debug("get_content StatistikNord harvester: %s" % url)
try: try:
http_response = urllib2.urlopen(url, timeout=100000) http_response = urllib2.urlopen(url, timeout=100000)
content = http_response.read() content = http_response.read()
...@@ -212,9 +155,7 @@ class StatistikNordHarvester(HarvesterBase): ...@@ -212,9 +155,7 @@ class StatistikNordHarvester(HarvesterBase):
def get_documents_from_content(content): def get_documents_from_content(content):
fetched_xml = etree.fromstring(content) fetched_xml = etree.fromstring(content)
fetched_string = etree.tostring(fetched_xml) fetched_string = etree.tostring(fetched_xml)
fetched_document = StatistikNordDocuments(fetched_string) fetched_document = StatistikNordDocuments(fetched_string)
fetched_values = fetched_document.read_values() fetched_values = fetched_document.read_values()
return fetched_values return fetched_values
...@@ -236,22 +177,6 @@ class StatistikNordHarvester(HarvesterBase): ...@@ -236,22 +177,6 @@ class StatistikNordHarvester(HarvesterBase):
else: else:
return quelle + ':' + guid.strip() return quelle + ':' + guid.strip()
def add_groups_to_fetched_values(self, fetched_values):
# Adds a 'Kategorie' entry to fetched_values (mutating it) and returns the same dict.
# fetched_values['Kategorie']['NameDerKategorie'] ends up as the result of
# self.get_all_groups() when that lookup ran and returned something truthy,
# and as an empty list otherwise.
# NOTE(review): indentation is lost in this view, so the nesting of the final
# log.debug call (inside or after `if groups:`) cannot be determined here.
groups = []
# Groups are only looked up when 'StANProdukte' is present and contains '4'.
# NOTE(review): whether this is a substring test or a list-membership test
# depends on the type of fetched_values['StANProdukte'] - confirm.
if 'StANProdukte' in fetched_values and '4' in fetched_values['StANProdukte']:
log.debug("Get Groups from database")
groups = self.get_all_groups()
#else:
# if 'StANThemen' in fetched_values:
# groups = self.translate_group(fetched_values['StANThemen'])
# Any existing 'Kategorie' entry is overwritten; it starts with an empty name list.
fetched_values['Kategorie'] = {}
fetched_values['Kategorie']['NameDerKategorie'] = []
if groups:
fetched_values['Kategorie']['NameDerKategorie'] = groups
log.debug(fetched_values['Kategorie']['NameDerKategorie'])
return fetched_values
@staticmethod @staticmethod
def get_all_groups(): def get_all_groups():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment