From 73a851de4ef0865a90c0c534e3bc2c4492d8b200 Mon Sep 17 00:00:00 2001
From: anonymous <anonymous>
Date: Fri, 14 Sep 2018 13:19:48 +0200
Subject: [PATCH] adds the first working iteration of the Kiel harvester

---
 ckanext/odsh/harvesters/__init__.py      |   1 +
 ckanext/odsh/harvesters/kielharvester.py | 115 +++++++++++++++++++++++
 setup.py                                 |   1 +
 3 files changed, 117 insertions(+)
 create mode 100755 ckanext/odsh/harvesters/kielharvester.py
 mode change 100644 => 100755 setup.py

diff --git a/ckanext/odsh/harvesters/__init__.py b/ckanext/odsh/harvesters/__init__.py
index 8ac43649..9b5e8fc7 100755
--- a/ckanext/odsh/harvesters/__init__.py
+++ b/ckanext/odsh/harvesters/__init__.py
@@ -7,3 +7,4 @@ except ImportError:
     __path__ = pkgutil.extend_path(__path__, __name__)
 
 from ckanext.odsh.harvesters.statistiknordharvester import StatistikNordHarvester
+from ckanext.odsh.harvesters.kielharvester import KielHarvester
diff --git a/ckanext/odsh/harvesters/kielharvester.py b/ckanext/odsh/harvesters/kielharvester.py
new file mode 100755
index 00000000..476d9c4d
--- /dev/null
+++ b/ckanext/odsh/harvesters/kielharvester.py
@@ -0,0 +1,115 @@
+from ckan import model
+from ckan.logic import get_action
+from ckan.plugins import toolkit
+from ckanext.harvest.harvesters.base import HarvesterBase
+from ckanext.harvest.model import HarvestObject
+
+import requests
+import uuid
+import traceback
+import json
+import logging
+import datetime
+
+log = logging.getLogger(__name__)
+
+
+class KielHarvester(HarvesterBase):
+    '''
+    A harvester for Kiel Open Data: the source URL must return a JSON list of
+    dataset dicts, each of which becomes one CKAN package.
+    '''
+
+    @staticmethod
+    def info():
+        '''Return the descriptor under which this harvester is registered.'''
+        return {
+            'name': 'kiel',
+            'title': 'Kiel Open Data',
+            'description': 'Harvests Kiel Open Data',
+            'form_config_interface': 'Text'
+        }
+
+    def gather_stage(self, harvest_job):
+        '''
+        Download the dataset list and save one HarvestObject per dataset.
+
+        Returns the list of created harvest object ids, or None when the
+        download fails or yields no datasets (a gather error is saved then).
+        '''
+        ids = []
+        try:
+            # Fetch inside the try block so that network and JSON decoding
+            # failures are recorded as gather errors instead of propagating.
+            datasets = requests.get(url=harvest_job.source.url).json()
+            for dataset in datasets:
+                # Name-based UUID: the same dataset URL always maps to the same GUID.
+                guid = str(uuid.uuid3(uuid.NAMESPACE_URL, dataset.get("url").encode('ascii', 'ignore')))
+                obj = HarvestObject(job=harvest_job, guid=guid)
+                obj.content = json.dumps(dataset)
+                obj.save()
+                log.info("harvest_object_id: %s, GUID: %s successfully gathered " % (str(obj.id), str(obj.guid)))
+                ids.append(obj.id)
+        except Exception as e:
+            self._save_gather_error(
+                'Kiel-Harvester: Error gathering the identifiers from the source server [%s]' % str(e),
+                harvest_job)
+            log.error(e)
+            return None
+
+        if not ids:
+            log.error("No records received")
+            self._save_gather_error("Couldn't find any metadata files", harvest_job)
+            return None
+
+        log.info("finished %s IDs of %s IDs successfully gathered" % (len(ids), len(datasets)))
+        log.debug("List of gathered IDs: %s" % ids)
+        log.debug("gather_stage() finished: %s IDs gathered" % len(ids))
+        return ids
+
+    @staticmethod
+    def fetch_stage(harvest_object):
+        '''Nothing to fetch: the content was already stored while gathering.'''
+        return bool(harvest_object)
+
+    def import_stage(self, harvest_object):
+        '''
+        Create or update the CKAN package described by the harvest object.
+
+        Returns False (after saving an object error where possible) when the
+        object is missing, empty, lacks a valid publication date or fails
+        validation; otherwise the result of _create_or_update_package().
+        '''
+        if not harvest_object:
+            log.error('Kiel-Harvester: No harvest object received')
+            return False
+        if harvest_object.content is None:
+            self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import')
+            return False
+
+        context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
+        package_dict = json.loads(harvest_object.content)
+        try:
+            # The date with role 'veroeffentlicht' (published) becomes the modification date.
+            published = [d['date'] for d in package_dict['extras']['dates'] if d['role'] == 'veroeffentlicht'][-1]
+            package_dict['metadata_modified'] = datetime.datetime.strptime(published, "%d.%m.%Y")
+        except (KeyError, IndexError, TypeError, ValueError) as e:
+            self._save_object_error('No valid publication date: %r' % e, harvest_object, 'Import')
+            return False
+
+        source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id})
+        package_dict['owner_org'] = source_dataset.get('owner_org')
+        if package_dict.get('type') == 'datensatz':
+            package_dict['type'] = 'dataset'
+        package_dict['id'] = harvest_object.guid
+        package_dict['groups'] = list()
+        package_dict['extras'] = list()
+
+        # Tag entries may hold comma separated lists; strip before validating.
+        names = [name.strip() for tag in package_dict.get('tags') or [] for name in tag.split(',')]
+        package_dict['tags'] = [{'name': name} for name in names if name and len(name) < 100]
+        try:
+            return self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show')
+        except toolkit.ValidationError as e:
+            self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import')
+            return False
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index 8f6b6e37..c716971c
--- a/setup.py
+++ b/setup.py
@@ -82,6 +82,7 @@ setup(
         [ckan.plugins]
         odsh=ckanext.odsh.plugin:OdshPlugin
         statistiknord_harvester=ckanext.odsh.harvesters:StatistikNordHarvester
+        kiel_harvester=ckanext.odsh.harvesters:KielHarvester
 
         [babel.extractors]
         ckan = ckan.lib.extract:extract_ckan
-- 
GitLab