import urllib
import urllib2
import httplib
import datetime
import socket
import traceback
from lxml import etree
import uuid

from ckan import model, logic
from ckan.logic import ValidationError, NotFound, get_action
from ckan.lib.helpers import json
from ckan.lib.munge import munge_name
from ckan.lib.navl.validators import not_empty
from ckan.plugins import toolkit

from ckanext.harvest.model import HarvestObject, HarvestJob
from ckanext.harvest.harvesters.base import HarvesterBase

from ckanext.odsh.model.statistiknord import *

import logging
log = logging.getLogger(__name__)


class StatistikNordHarvester(HarvesterBase):
    '''
    A Harvester for Statistikamt Nord.

    gather_stage() downloads the register XML and creates one HarvestObject
    per unique document identifier; import_stage() maps the fetched values
    onto a CKAN package dict and creates or updates the dataset.
    '''

    def info(self):
        """Describe this harvester to the ckanext-harvest plugin framework."""
        return {
            'name': 'statistik-nord',
            'title': 'Statistik Nord',
            'description': 'Harvests Statistikamt Nord',
            'form_config_interface': 'Text'
        }

    def gather_stage(self, harvest_job):
        """Fetch the register, deduplicate entries and store one HarvestObject
        per identifier.

        Returns the list of harvest object ids, or None/False on failure
        (both are treated as an error by the harvest framework).
        """
        url = harvest_job.source.url

        try:
            fetched_documents = self._get_content(url)
            documents_list = self.get_documents_from_content(fetched_documents)
            documents = documents_list['RegistereintragsListe']
        except Exception as e:
            log.error('traceback while reading model: %s' % traceback.format_exc())
            self._save_gather_error('Statistik-Nord-Harvester: Error while reading model [%r]' % e, harvest_job)
            return False

        try:
            used_identifiers = []
            ids = []
            for document in documents:
                try:
                    fetched_values = self.get_values_from_content(document)
                    identifier = self._create_inforeg_id(fetched_values)
                    log.info('identifier: %s' % identifier)

                    # Skip duplicates and documents without a usable id.
                    if identifier in used_identifiers:
                        continue

                    if identifier is None:
                        log.error("ID: unknown - gather process failed ")
                        continue

                    if identifier:
                        obj = HarvestObject(guid=identifier,
                                            job=harvest_job)
                        obj.content = json.dumps(fetched_values)
                        obj.save()
                        log.info(
                            "harvest_object_id: %s, GUID: %s successfully gathered " % (str(obj.id), str(obj.guid)))
                        used_identifiers.append(identifier)
                        ids.append(obj.id)
                        log.debug('Save identifier %s from Statistik Nord' % identifier)

                except Exception as e:
                    log.error('traceback: %s' % traceback.format_exc())
                    self._save_gather_error(
                        'Statistik-Nord-Harvester: Error for the identifier %s [%r]' % (identifier, e), harvest_job)
                    continue

        except Exception as e:
            self._save_gather_error(
                'Statistik-Nord-Harvester: Error gathering the identifiers from the source server [%s]' % str(e),
                harvest_job)
            log.error(e)
            return None

        if len(ids) > 0:
            log.info(
                "finished %s IDs of %s IDs successfully gathered xxx" % (len(used_identifiers), len(documents)))
            log.debug("List of gathered IDs: %s" % ids)
            log.debug("gather_stage() finished: %s IDs gathered" % len(ids))
            return ids
        else:
            log.error("No records received")
            # BUG FIX: _save_gather_error takes (message, job); previously the
            # job was interpolated into the message and the job argument was
            # missing entirely.
            self._save_gather_error("Couldn't find any metadata files", harvest_job)
            return None

    @staticmethod
    def fetch_stage(harvest_object):
        # Nothing to do: all data is already fetched during gather_stage().
        return True

    def import_stage(self, harvest_object):
        """Map the gathered values onto a CKAN package dict and create or
        update the dataset.  Returns True on success, False otherwise."""
        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name(),
        }
        log.debug("user: " + self._get_user_name())
        if not harvest_object:
            log.error('Statistik-Nord-Harvester: No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import')
            return False

        values = json.loads(harvest_object.content)
        package_dict = dict()
        package_dict.update({'resources': [], 'tags': []})
        package_dict.update({'title': values['Titel']})
        package_dict.update({'notes': values['Beschreibung']})
        package_dict.update({'license_id': values['Nutzungsbestimmungen']['ID_derLizenz']})
        package_dict.update({'author': values["VeroeffentlichendeStelle"]["Name"]})
        package_dict.update({'author_email': values["VeroeffentlichendeStelle"]["EMailAdresse"]})

        extras = dict()
        extras.update({'metadata_original_id': self._create_inforeg_id(values)})
        # NOTE(review): `extras` is built but never attached to package_dict,
        # so the original id is not stored on the dataset.  Kept as-is pending
        # confirmation of the intended extras format.

        if values['Ansprechpartner']:
            package_dict.update({'maintainer': values['Ansprechpartner']['Name'],
                                 'maintainer_email': values['Ansprechpartner']['EMailAdresse']})
        try:
            package_dict['url'] = values['WeitereInformationen']['URLZumDownload']
        except KeyError:
            package_dict['url'] = ""

        package_dict.update({'type': 'dataset'})

        resources = values['Ressourcen']['Ressource']

        for resource in resources:
            resource_dict = dict()
            resource_dict['name'] = resource['Ressourcenname']
            resource_dict['format'] = resource['Format'].get('FormatTyp', "")
            resource_dict['url'] = resource['URLZumDownload']
            if resource['Dateigroesse'] == "0" or len(resource['Dateigroesse']) == 0:
                # BUG FIX: the key was resource['url'] (which never exists in
                # the fetched values) and the response object was subscripted
                # directly; read the Content-Length header via .info() instead.
                resource_file = urllib2.urlopen(resource['URLZumDownload'])
                resource_dict['file_size'] = resource_file.info().get('Content-Length')
            else:
                # Dateigroesse is given in megabytes; store bytes.
                file_size = int(round(float(resource['Dateigroesse']) * 1000000))
                resource_dict['file_size'] = file_size
            package_dict['resources'].append(resource_dict)

        source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id})
        package_dict['owner_org'] = source_dataset.get('owner_org')

        package_dict['id'] = str(uuid.uuid4())

        log.debug(json.dumps(package_dict))
        try:
            result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show')
            return result
        except toolkit.ValidationError as e:
            self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import')
            return False

    @staticmethod
    def _get_content(url):
        """Download *url* (spaces percent-encoded) and return the raw body.

        Re-raises any error after logging it.
        """
        url = url.replace(' ', '%20')
        log.debug("get_content StatistikNord harvester: %s" % url)
        try:
            http_response = urllib2.urlopen(url, timeout=100000)
            content = http_response.read()
            return content
        except Exception as e:
            log.error('traceback WebHarvester could not get content!: %s' % traceback.format_exc())
            log.debug("Error in _get_content %s" % e)
            # Bare `raise` preserves the original traceback.
            raise

    @staticmethod
    def get_documents_from_content(content):
        """Parse the register XML and return the mapped list of entries."""
        fetched_xml = etree.fromstring(content)
        fetched_string = etree.tostring(fetched_xml)

        fetched_document = StatistikNordDocuments(fetched_string)

        fetched_values = fetched_document.read_values()
        return fetched_values

    @staticmethod
    def get_values_from_content(content):
        """Parse a single register entry and return its mapped values."""
        fetched_xml = etree.fromstring(content)
        fetched_string = etree.tostring(fetched_xml)
        fetched_document = StatistikNordDocument(fetched_string)
        fetched_values = fetched_document.read_values()

        return fetched_values

    @staticmethod
    def _create_inforeg_id(values):
        """Build a stable GUID of the form '<Quelle>:<DokumentID>'; the
        prefix is omitted when the id already starts with the source name."""
        guid = values['DokumentID']
        quelle = values['Quelle']
        if guid.startswith(quelle):
            return guid.strip()
        else:
            return quelle + ':' + guid.strip()

    def add_groups_to_fetched_values(self, fetched_values):
        """Attach CKAN group names under 'Kategorie' to the fetched values."""
        groups = []
        if 'StANProdukte' in fetched_values and '4' in fetched_values['StANProdukte']:
            log.debug("Get Groups from database")
            groups = self.get_all_groups()

        fetched_values['Kategorie'] = {}
        fetched_values['Kategorie']['NameDerKategorie'] = []
        if groups:
            fetched_values['Kategorie']['NameDerKategorie'] = groups

        log.debug(fetched_values['Kategorie']['NameDerKategorie'])
        return fetched_values

    @staticmethod
    def get_all_groups():
        """Return the names of all active CKAN groups."""
        result_groups = []
        # BUG FIX: bare `Session` was undefined in this module; use
        # model.Session as everywhere else in the class.
        groups_in_database = model.Session.query(model.Group.name).filter(model.Group.state == 'active')
        for group_in_database in groups_in_database.all():
            result_groups.append(group_in_database.name)

        return result_groups


class ContentFetchError(Exception):
    pass


class ContentNotFoundError(ContentFetchError):
    pass


class RemoteResourceError(Exception):
    pass


class SearchError(Exception):
    pass
# -*- coding: utf-8 -*-
"""Declarative XML-to-dict mapping for Statistikamt Nord register entries.

Built on ckanext-spatial's MappedXmlDocument framework: each element pairs a
key of the resulting values dict with the XPath expression used to extract it
from the delivered XML.
"""
from ckanext.spatial.model.harvested_metadata import MappedXmlDocument, MappedXmlElement


class StatistikNordDocumentElement(MappedXmlElement):
    # Plain element; all mapping behaviour is inherited from MappedXmlElement.
    pass


def _leaf(name, multiplicity="0..1", path=None):
    """Build a leaf element extracting text; XPath defaults to '<name>/text()'."""
    return StatistikNordDocumentElement(
        name=name,
        search_paths=[path if path is not None else name + "/text()"],
        multiplicity=multiplicity,
    )


def _node(name, multiplicity, children, path=None):
    """Build a container element holding nested child elements."""
    return StatistikNordDocumentElement(
        name=name,
        search_paths=[path if path is not None else name],
        multiplicity=multiplicity,
        elements=children,
    )


class StatistikNordDocumentRelationship(MappedXmlElement):
    # One 'Beziehungsverknuepfung' entry: a relation type plus related ids.
    elements = [
        _leaf("Typ", multiplicity="1"),
        _leaf("ID", multiplicity="*"),
    ]


class StatistikNordDocumentResource(MappedXmlElement):
    # One downloadable 'Ressource' of a register entry.
    elements = [
        _leaf("URLZumDownload"),
        _leaf("DateinameInDerLieferung"),
        _leaf("Ressourcenname", multiplicity="1"),
        _node("Format", "1", [_leaf("FormatTyp", multiplicity="1")]),
        _leaf("Dateigroesse"),
    ]


class StatistikNordDocument(MappedXmlDocument):
    # Full mapping for one 'RegistereintragAusLiefersystem' entry.
    elements = [
        _leaf("Titel", multiplicity="1"),
        _leaf("Beschreibung"),
        _node("Schlagwoerter", "0..1", [_leaf("Schlagwort", multiplicity="*")]),
        _node("Nutzungsbestimmungen", "0..1", [_leaf("ID_derLizenz", multiplicity="*")]),
        _leaf("Namensnennung"),
        _node("VeroeffentlichendeStelle", "1", [
            _leaf("Name", multiplicity="1"),
            _leaf("EMailAdresse"),
        ]),
        _node("Ansprechpartner", "0..1", [
            _leaf("Name"),
            _leaf("EMailAdresse"),
        ]),
        # BUG FIX: this child was named "URL" although the harvester reads
        # values['WeitereInformationen']['URLZumDownload']; name the produced
        # key after the consumer so the dataset url is actually populated.
        _node("WeitereInformationen", "0..1", [
            _leaf("URLZumDownload", multiplicity="1"),
        ]),
        _leaf("TypDesInhalts", multiplicity="1"),
        StatistikNordDocumentElement(
            name="Ressourcen",
            search_paths=["Ressourcen"],
            multiplicity="1",
            elements=[
                StatistikNordDocumentResource(
                    name="Ressource",
                    search_paths=["Ressource"],
                    multiplicity="*",
                ),
            ],
        ),
        _leaf("ZeitraumVon"),
        _leaf("ZeitraumBis"),
        _leaf("ZeitlicheGranularitaetIntervallDerErhebung"),
        _leaf("ZeitlicheGranularitaetEinheit"),
        _leaf("DokumentID", multiplicity="1"),
        _node("AbdeckungInKoordinatenAusOGDD", "0..1",
              [_leaf("Koordinaten", multiplicity="*")]),
        # NOTE(review): this block also searches 'Koordinaten' — it looks like
        # a copy of the previous element; kept as in the original, confirm
        # against the source XML schema.
        _node("ArtDerFormAusOGDD", "0..1",
              [_leaf("Koordinaten", multiplicity="*")]),
        _leaf("AbdeckungAlsTextAusOGDD"),
        _leaf("Bild"),
        _node("Infogegenstand", "1", [_leaf("Bezeichnung", multiplicity="*")]),
        _leaf("E-Aktenzeichen"),
        _leaf("Papieraktenzeichen"),
        _leaf("OfflineDatum"),
        _node("Nummern", "0..1", [_leaf("Nummer", multiplicity="*")]),
        _leaf("Quelle", multiplicity="1"),
        _leaf("Volltext"),
        _leaf("AlternativtextFuerDasBild"),
        StatistikNordDocumentElement(
            name="Beziehungsverknuepfungen",
            search_paths=["Beziehungsverknuepfungen"],
            multiplicity="0..1",
            elements=[
                StatistikNordDocumentRelationship(
                    name="Beziehungsverknuepfung",
                    search_paths=["Beziehungsverknuepfung"],
                    multiplicity="*",
                ),
            ],
        ),
        _node("MetadatenModellVersionsnummer", "0..1",
              [_leaf("Versionsnummer", multiplicity="1")]),
        _leaf("StANKategorie", multiplicity="1"),
        _leaf("StANThemen", multiplicity="*",
              path="StANThemen/StANThemaID/text()"),
        # NOTE(review): 'StANProductID' may be a typo for 'StANProduktID' —
        # kept as delivered; confirm against the source XML schema.
        _leaf("StANProdukte", multiplicity="*",
              path="StANProdukte/StANProductID/text()"),
    ]


class StatistikNordDocuments(MappedXmlDocument):
    # Top-level register document: collects every entry in the delivery.
    elements = [
        StatistikNordDocumentElement(
            name="RegistereintragsListe",
            search_paths=["RegistereintragAusLiefersystem"],
            multiplicity="*",
        ),
    ]