Commit b7ee53f6 authored by anonymous

working statistik-nord harvester

parent 9528e2a8
ckanext/odsh/harvesters/__init__.py:

# this is a namespace package
try:
import pkg_resources
pkg_resources.declare_namespace(__name__)
except ImportError:
import pkgutil
__path__ = pkgutil.extend_path(__path__, __name__)
from ckanext.odsh.harvesters.statistiknordharvester import StatistikNordHarvester
ckanext/odsh/harvesters/statistiknordharvester.py:

import urllib
import urllib2
import httplib
import datetime
import socket
import traceback
from lxml import etree
import uuid
from ckan import model, logic
from ckan.logic import ValidationError, NotFound, get_action
from ckan.lib.helpers import json
from ckan.lib.munge import munge_name
from ckan.lib.navl.validators import not_empty
from ckan.plugins import toolkit
from ckanext.harvest.model import HarvestObject, HarvestJob
from ckanext.harvest.harvesters.base import HarvesterBase
from ckanext.odsh.model.statistiknord import *
import logging
log = logging.getLogger(__name__)
class StatistikNordHarvester(HarvesterBase):
'''
A Harvester for Statistikamt Nord
'''
def info(self):
return {
'name': 'statistik-nord',
'title': 'Statistik Nord',
'description': 'Harvests Statistikamt Nord',
'form_config_interface': 'Text'
}
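
    # gather_stage: download the complete XML register from the harvest source URL,
    # derive a stable GUID for each entry via _create_inforeg_id() and create one
    # HarvestObject per new GUID, with the parsed values stored as JSON in its content.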
def gather_stage(self, harvest_job):
url = harvest_job.source.url
try:
fetched_documents = self._get_content(url)
documents_list = self.get_documents_from_content(fetched_documents)
documents = documents_list['RegistereintragsListe']
except Exception, e:
log.error('traceback while reading model: %s' % traceback.format_exc())
self._save_gather_error('Statistik-Nord-Harvester: Error while reading model [%r]' % e, harvest_job)
return False
try:
used_identifiers = []
ids = []
            for document in documents:
                identifier = None
                try:
fetched_values = self.get_values_from_content(document)
identifier = self._create_inforeg_id(fetched_values)
log.info('identifier: %s' % identifier)
if identifier in used_identifiers:
continue
if identifier is None:
log.error("ID: unknown - gather process failed ")
continue
if identifier:
obj = HarvestObject(guid=identifier,
job=harvest_job)
obj.content = json.dumps(fetched_values)
obj.save()
log.info(
"harvest_object_id: %s, GUID: %s successfully gathered " % (str(obj.id), str(obj.guid)))
used_identifiers.append(identifier)
ids.append(obj.id)
log.debug('Save identifier %s from Statistik Nord' % identifier)
except Exception, e:
log.error('traceback: %s' % traceback.format_exc())
self._save_gather_error(
'Statistik-Nord-Harvester: Error for the identifier %s [%r]' % (identifier, e), harvest_job)
continue
except Exception, e:
self._save_gather_error(
'Statistik-Nord-Harvester: Error gathering the identifiers from the source server [%s]' % str(e),
harvest_job)
log.error(e)
return None
if len(ids) > 0:
            log.info("finished: %s of %s documents successfully gathered"
                     % (len(used_identifiers), len(documents)))
log.debug("List of gathered IDs: %s" % ids)
log.debug("gather_stage() finished: %s IDs gathered" % len(ids))
return ids
else:
log.error("No records received")
            self._save_gather_error("Couldn't find any metadata files", harvest_job)
return None
@staticmethod
def fetch_stage(harvest_object):
return True
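
    # import_stage: the parsed values were already stored during gather_stage (which is
    # why fetch_stage is a no-op above), so this stage only maps them onto a CKAN
    # package dict and hands it to _create_or_update_package().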
def import_stage(self, harvest_object):
context = {
'model': model,
'session': model.Session,
'user': self._get_user_name(),
}
log.debug("user: " + self._get_user_name())
if not harvest_object:
log.error('Statistik-Nord-Harvester: No harvest object received')
return False
if harvest_object.content is None:
self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import')
return False
else:
values = json.loads(harvest_object.content)
package_dict = dict()
package_dict.update({'resources': [], 'tags': []})
package_dict.update({'title': values['Titel']})
package_dict.update({'notes': values['Beschreibung']})
package_dict.update({'license_id': values['Nutzungsbestimmungen']['ID_derLizenz']})
package_dict.update({'author': values["VeroeffentlichendeStelle"]["Name"]})
package_dict.update({'author_email': values["VeroeffentlichendeStelle"]["EMailAdresse"]})
extras = dict()
extras.update({'metadata_original_id': self._create_inforeg_id(values)})
if values['Ansprechpartner']:
package_dict.update({'maintainer': values['Ansprechpartner']['Name'],
'maintainer_email': values['Ansprechpartner']['EMailAdresse']})
            try:
                # the mapped document exposes this field under the key 'URL'
                # (see StatistikNordDocument in ckanext/odsh/model/statistiknord.py)
                package_dict['url'] = values['WeitereInformationen']['URL']
            except (KeyError, TypeError):
                package_dict['url'] = ""
package_dict.update({'type': 'dataset'})
resources = values['Ressourcen']['Ressource']
for resource in resources:
resource_dict = dict()
resource_dict['name'] = resource['Ressourcenname']
resource_dict['format'] = resource['Format'].get('FormatTyp', "")
resource_dict['url'] = resource['URLZumDownload']
                if resource['Dateigroesse'] == "0" or len(resource['Dateigroesse']) == 0:
                    # no size in the metadata: take it from the Content-Length header instead
                    resource_file = urllib2.urlopen(resource['URLZumDownload'])
                    resource_dict['file_size'] = resource_file.info().getheader('Content-Length')
                else:
                    # Dateigroesse is given in megabytes, convert to bytes
                    file_size = int(round(float(resource['Dateigroesse']) * 1000000))
                    resource_dict['file_size'] = file_size
package_dict['resources'].append(resource_dict)
source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id})
package_dict['owner_org'] = source_dataset.get('owner_org')
package_dict['id'] = str(uuid.uuid4())
log.debug(json.dumps(package_dict))
try:
result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show')
return result
except toolkit.ValidationError, e:
self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import')
return False
@staticmethod
def _get_content(url):
url = url.replace(' ', '%20')
log.debug("get_content StatistikNord harvester: %s" % url)
try:
http_response = urllib2.urlopen(url, timeout=100000)
content = http_response.read()
return content
except Exception, e:
log.error('traceback WebHarvester could not get content!: %s' % traceback.format_exc())
log.debug("Error in _get_content %s" % e)
raise e
@staticmethod
def get_documents_from_content(content):
fetched_xml = etree.fromstring(content)
fetched_string = etree.tostring(fetched_xml)
fetched_document = StatistikNordDocuments(fetched_string)
fetched_values = fetched_document.read_values()
return fetched_values
@staticmethod
def get_values_from_content(content):
fetched_xml = etree.fromstring(content)
fetched_string = etree.tostring(fetched_xml)
fetched_document = StatistikNordDocument(fetched_string)
fetched_values = fetched_document.read_values()
return fetched_values
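
    # _create_inforeg_id builds the GUID used for the HarvestObject, e.g. (hypothetical
    # values) Quelle 'StatistikamtNord' and DokumentID '1234' give 'StatistikamtNord:1234';
    # a DokumentID that already starts with the Quelle prefix is used as-is.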
@staticmethod
def _create_inforeg_id(values):
guid = values['DokumentID']
quelle = values['Quelle']
if guid.startswith(quelle):
return guid.strip()
else:
return quelle + ':' + guid.strip()
def add_groups_to_fetched_values(self, fetched_values):
groups = []
if 'StANProdukte' in fetched_values and '4' in fetched_values['StANProdukte']:
log.debug("Get Groups from database")
groups = self.get_all_groups()
#else:
# if 'StANThemen' in fetched_values:
# groups = self.translate_group(fetched_values['StANThemen'])
fetched_values['Kategorie'] = {}
fetched_values['Kategorie']['NameDerKategorie'] = []
if groups:
fetched_values['Kategorie']['NameDerKategorie'] = groups
log.debug(fetched_values['Kategorie']['NameDerKategorie'])
return fetched_values
@staticmethod
def get_all_groups():
result_groups = []
        groups_in_database = model.Session.query(model.Group.name).filter(model.Group.state == 'active')
for group_in_database in groups_in_database.all():
result_groups.append(group_in_database.name)
return result_groups
class ContentFetchError(Exception):
pass
class ContentNotFoundError(ContentFetchError):
pass
class RemoteResourceError(Exception):
pass
class SearchError(Exception):
pass
ckanext/odsh/model/statistiknord.py:

# -*- coding: utf-8 -*-
from ckanext.spatial.model.harvested_metadata import MappedXmlDocument, MappedXmlElement
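
# The classes below build on ckanext-spatial's MappedXmlDocument/MappedXmlElement:
# every element declares XPath search_paths that are evaluated against the XML
# document, read_values() returns the results as a nested dict keyed by the element
# names, and multiplicity ("1", "0..1", "*") decides whether a single value or a
# list of values comes back.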
class StatistikNordDocumentElement(MappedXmlElement):
pass
class StatistikNordDocumentRelationship(MappedXmlElement):
elements = [
StatistikNordDocumentElement(
name="Typ",
search_paths=[
"Typ/text()",
],
multiplicity="1",
),
StatistikNordDocumentElement(
name="ID",
search_paths=[
"ID/text()",
],
multiplicity="*",
)
]
class StatistikNordDocumentResource(MappedXmlElement):
elements = [
StatistikNordDocumentElement(
name="URLZumDownload",
search_paths=[
"URLZumDownload/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="DateinameInDerLieferung",
search_paths=[
"DateinameInDerLieferung/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="Ressourcenname",
search_paths=[
"Ressourcenname/text()",
],
multiplicity="1",
),
StatistikNordDocumentElement(
name="Format",
search_paths=[
"Format",
],
multiplicity="1",
elements=[
StatistikNordDocumentElement(
name="FormatTyp",
search_paths=[
"FormatTyp/text()",
],
multiplicity="1"
),
]
),
StatistikNordDocumentElement(
name="Dateigroesse",
search_paths=[
"Dateigroesse/text()",
],
multiplicity="0..1",
)
]
class StatistikNordDocument(MappedXmlDocument):
elements = [
StatistikNordDocumentElement(
name="Titel",
search_paths=[
"Titel/text()",
],
multiplicity="1",
),
StatistikNordDocumentElement(
name="Beschreibung",
search_paths=[
"Beschreibung/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="Schlagwoerter",
search_paths=[
"Schlagwoerter",
],
multiplicity="0..1",
elements=[
StatistikNordDocumentElement(
name="Schlagwort",
search_paths=[
"Schlagwort/text()",
],
multiplicity="*"
),
]
),
StatistikNordDocumentElement(
name="Nutzungsbestimmungen",
search_paths=[
"Nutzungsbestimmungen",
],
multiplicity="0..1",
elements=[
StatistikNordDocumentElement(
name="ID_derLizenz",
search_paths=[
"ID_derLizenz/text()",
],
multiplicity="*"
),
]
),
StatistikNordDocumentElement(
name="Namensnennung",
search_paths=[
"Namensnennung/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="VeroeffentlichendeStelle",
search_paths=[
"VeroeffentlichendeStelle",
],
multiplicity="1",
elements=[
StatistikNordDocumentElement(
name="Name",
search_paths=[
"Name/text()",
],
multiplicity="1"
),
StatistikNordDocumentElement(
name="EMailAdresse",
search_paths=[
"EMailAdresse/text()",
],
multiplicity="0..1"
)
]
),
StatistikNordDocumentElement(
name="Ansprechpartner",
search_paths=[
"Ansprechpartner",
],
multiplicity="0..1",
elements=[
StatistikNordDocumentElement(
name="Name",
search_paths=[
"Name/text()",
],
multiplicity="0..1"
),
StatistikNordDocumentElement(
name="EMailAdresse",
search_paths=[
"EMailAdresse/text()",
],
multiplicity="0..1"
)
]
),
StatistikNordDocumentElement(
name="WeitereInformationen",
search_paths=[
"WeitereInformationen",
],
multiplicity="0..1",
elements=[
StatistikNordDocumentElement(
name="URL",
search_paths=[
"URLZumDownload/text()",
],
multiplicity="1"
),
]
),
StatistikNordDocumentElement(
name="TypDesInhalts",
search_paths=[
"TypDesInhalts/text()",
],
multiplicity="1",
),
StatistikNordDocumentElement(
name="Ressourcen",
search_paths=[
"Ressourcen",
],
multiplicity="1",
elements=[
StatistikNordDocumentResource(
name="Ressource",
search_paths=[
"Ressource",
],
multiplicity="*"
),
]
),
StatistikNordDocumentElement(
name="ZeitraumVon",
search_paths=[
"ZeitraumVon/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="ZeitraumBis",
search_paths=[
"ZeitraumBis/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="ZeitlicheGranularitaetIntervallDerErhebung",
search_paths=[
"ZeitlicheGranularitaetIntervallDerErhebung/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="ZeitlicheGranularitaetEinheit",
search_paths=[
"ZeitlicheGranularitaetEinheit/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="DokumentID",
search_paths=[
"DokumentID/text()",
],
multiplicity="1",
),
StatistikNordDocumentElement(
name="AbdeckungInKoordinatenAusOGDD",
search_paths=[
"AbdeckungInKoordinatenAusOGDD",
],
multiplicity="0..1",
elements=[
StatistikNordDocumentElement(
name="Koordinaten",
search_paths=[
"Koordinaten/text()",
],
multiplicity="*"
),
]
),
StatistikNordDocumentElement(
name="ArtDerFormAusOGDD",
search_paths=[
"ArtDerFormAusOGDD",
],
multiplicity="0..1",
elements=[
StatistikNordDocumentElement(
name="Koordinaten",
search_paths=[
"Koordinaten/text()",
],
multiplicity="*"
),
]
),
StatistikNordDocumentElement(
name="AbdeckungAlsTextAusOGDD",
search_paths=[
"AbdeckungAlsTextAusOGDD/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="Bild",
search_paths=[
"Bild/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="Infogegenstand",
search_paths=[
"Infogegenstand",
],
multiplicity="1",
elements=[
StatistikNordDocumentElement(
name="Bezeichnung",
search_paths=[
"Bezeichnung/text()",
],
multiplicity="*"
),
]
),
StatistikNordDocumentElement(
name="E-Aktenzeichen",
search_paths=[
"E-Aktenzeichen/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="Papieraktenzeichen",
search_paths=[
"Papieraktenzeichen/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="OfflineDatum",
search_paths=[
"OfflineDatum/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="Nummern",
search_paths=[
"Nummern",
],
multiplicity="0..1",
elements=[
StatistikNordDocumentElement(
name="Nummer",
search_paths=[
"Nummer/text()",
],
multiplicity="*"
),
]
),
StatistikNordDocumentElement(
name="Quelle",
search_paths=[
"Quelle/text()",
],
multiplicity="1",
),
StatistikNordDocumentElement(
name="Volltext",
search_paths=[
"Volltext/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="AlternativtextFuerDasBild",
search_paths=[
"AlternativtextFuerDasBild/text()",
],
multiplicity="0..1",
),
StatistikNordDocumentElement(
name="Beziehungsverknuepfungen",
search_paths=[
"Beziehungsverknuepfungen",
],
multiplicity="0..1",
elements=[
StatistikNordDocumentRelationship(
name="Beziehungsverknuepfung",
search_paths=[
"Beziehungsverknuepfung",
],
multiplicity="*"
),
]
),
StatistikNordDocumentElement(
name="MetadatenModellVersionsnummer",
search_paths=[
"MetadatenModellVersionsnummer",
],
multiplicity="0..1",
elements=[
StatistikNordDocumentElement(
name="Versionsnummer",
search_paths=[
"Versionsnummer/text()",
],
multiplicity="1"
),
]
),
StatistikNordDocumentElement(
name="StANKategorie",
search_paths=[
"StANKategorie/text()",
],
multiplicity="1",
),
StatistikNordDocumentElement(
name="StANThemen",
search_paths=[
"StANThemen/StANThemaID/text()",
],
multiplicity="*",
),
StatistikNordDocumentElement(
name="StANProdukte",
search_paths=[
"StANProdukte/StANProductID/text()",
],
multiplicity="*",
)
]
class StatistikNordDocuments(MappedXmlDocument):
elements = [
StatistikNordDocumentElement(
name="RegistereintragsListe",
search_paths=[
"RegistereintragAusLiefersystem",
],
multiplicity="*"
)
]
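
# Usage sketch (illustrative values only): the harvester first parses the full register
# with StatistikNordDocuments, which returns the individual entries re-serialised as XML
# strings, and then parses each entry with StatistikNordDocument, roughly:
#
#     entries = StatistikNordDocuments(catalogue_xml).read_values()['RegistereintragsListe']
#     values = StatistikNordDocument(entries[0]).read_values()
#     values['Titel'], values['DokumentID'], values['Quelle']
#
# where catalogue_xml is the XML string fetched from the harvest source URL.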
setup.py:

@@ -81,6 +81,7 @@ setup(
     entry_points='''
         [ckan.plugins]
         odsh=ckanext.odsh.plugin:OdshPlugin
+        statistiknord_harvester=ckanext.odsh.harvesters:StatistikNordHarvester
         [babel.extractors]
         ckan = ckan.lib.extract:extract_ckan
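
To actually run the harvester, the statistiknord_harvester plugin registered above has to be enabled via ckan.plugins in the CKAN configuration, together with ckanext-harvest's harvest plugin; a harvest source of type "statistik-nord" pointing at the Statistikamt Nord register URL can then be created in the harvest admin UI (standard ckanext-harvest workflow, not part of this commit).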