Commit a4b82cdf
authored 6 years ago by anonymous
ODPSH-16: statistiknordharvester.py calls ckan_mapper
parent 66104383
No related branches, tags, or merge requests found.
Changes: 1

Showing 1 changed file:
ckanext/odsh/harvesters/statistiknordharvester.py  +33 −108 (33 additions, 108 deletions)
-import urllib
 import urllib2
-import httplib
-import datetime
-import socket
 import traceback
+from ckanext.odsh.harvesters.ckan_mapper import pyjq_mapper
 from lxml import etree
 import uuid
-#import json
-from ckan import model, logic
+from ckan import model
-from ckan.logic import ValidationError, NotFound, get_action
+from ckan.logic import get_action
 from ckan.lib.helpers import json
-from ckan.lib.munge import munge_name
-from ckan.lib.navl.validators import not_empty
 from ckan.plugins import toolkit
-from ckanext.harvest.model import HarvestObject, HarvestJob
+from ckanext.harvest.model import HarvestObject
 from ckanext.harvest.harvesters.base import HarvesterBase
 from ckanext.odsh.model.statistiknord import *
 import logging
 log = logging.getLogger(__name__)
@@ -44,7 +41,7 @@ class StatistikNordHarvester(HarvesterBase):
             documents_list = self.get_documents_from_content(fetched_documents)
             documents = documents_list['RegistereintragsListe']
         except Exception, e:
-            log.error('traceback while reading model: %s' % traceback.format_exc())
+            # log.error('traceback while reading model: %s' % traceback.format_exc())
             self._save_gather_error('Statistik-Nord-Harvester: Error while reading model [%r]' % e, harvest_job)
             return False
@@ -55,7 +52,7 @@ class StatistikNordHarvester(HarvesterBase):
             try:
                 fetched_values = self.get_values_from_content(document)
                 identifier = self._create_inforeg_id(fetched_values)
-                log.info('identifier: %s' % identifier)
+                # log.info('identifier: %s' % identifier)
                 if identifier in used_identifiers:
                     continue
@@ -91,8 +88,8 @@ class StatistikNordHarvester(HarvesterBase):
         if len(ids) > 0:
             log.info(
                 "finished %s IDs of %s IDs successfully gathered" % (len(used_identifiers), len(documents)))
-            log.debug("List of gathered IDs: %s" % ids)
+            # log.debug("List of gathered IDs: %s" % ids)
-            log.debug("gather_stage() finished: %s IDs gathered" % len(ids))
+            # log.debug("gather_stage() finished: %s IDs gathered" % len(ids))
             return ids
         else:
             log.error("No records received")
@@ -109,6 +106,7 @@ class StatistikNordHarvester(HarvesterBase):
             'session': model.Session,
             'user': self._get_user_name(),
         }
         #log.debug("user: " + self._get_user_name())
         if not harvest_object:
             log.error('Statistik-Nord-Harvester: No harvest object received')
@@ -118,61 +116,21 @@ class StatistikNordHarvester(HarvesterBase):
             self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, u'Import')
             return False
         else:
+            self.dcat_mapper(context, harvest_object)
+        return True
+    # A mapper method that maps the content of the harvested object onto the CKAN dataset fields
+    def dcat_mapper(self, context, harvest_object):
         values = json.loads(harvest_object.content)
-        package_dict = dict()
-        package_dict.update({'resources': [], 'tags': [], 'groups':[]})
-        package_dict.update({'title': values['Titel']})
-        package_dict.update({'notes': values['Beschreibung']})
-        package_dict.update({'license_id': values['Nutzungsbestimmungen']['ID_derLizenz'][0]})
-        package_dict.update({'author': values["VeroeffentlichendeStelle"]["Name"]})
-        package_dict.update({'author_email': values["VeroeffentlichendeStelle"]["EMailAdresse"]})
-        extras = list()
-        extras.append({'key': 'identifier', 'value': self._create_inforeg_id(values)})
-        package_dict['extras'] = extras
-        if values['Ansprechpartner']:
-            package_dict.update({'maintainer': values['Ansprechpartner']['Name'], 'maintainer_email': values['Ansprechpartner']['EMailAdresse']})
-        try:
-            package_dict['url'] = values['WeitereInformationen']['URL']
-        except KeyError:
-            package_dict['url'] = ""
-        package_dict.update({'type': 'dataset'})
-        resources = values['Ressourcen']['Ressource']
-        for resource in resources:
-            resource_dict = dict()
-            resource_dict['name'] = resource['Ressourcenname']
-            resource_dict['format'] = resource['Format'].get('FormatTyp', "")
-            resource_dict['url'] = resource['URLZumDownload']
-            if resource['Dateigroesse'] == "0" or len(resource['Dateigroesse']) == 0:
-                resource_file = urllib2.urlopen(resource['url'])
-                resource_dict['file_size'] = resource_file['Content-Length']
-            else:
-                file_size = int(round(float(resource['Dateigroesse']) * 1000000))
-                resource_dict['file_size'] = file_size
-            package_dict['resources'].append(resource_dict)
-        tags = values['Schlagwoerter']['Schlagwort']
-        for tag in tags:
-            seperated_tags = tag.split(',')
-            for seperated_tag in seperated_tags:
-                if seperated_tag != '' and len(seperated_tag) < 100:
-                    package_dict['tags'].append({'name': seperated_tag.strip()})
-        self.map_to_group(package_dict, values)
+        # use the pyjq lib for the default field mapping
+        package = pyjq_mapper(values)
+        # add some meta data that is not part of the harvested_object
         source_dataset = get_action('package_show')(context.copy(), {'id': harvest_object.source.id})
-        package_dict['owner_org'] = source_dataset.get('owner_org')
-        package_dict['id'] = str(uuid.uuid4())
+        package['owner_org'] = source_dataset.get('owner_org')
+        package['id'] = str(uuid.uuid4())
+        package_dict = dict(package)
+        #log.debug(json.dumps(package_dict))
         try:
             result = self._create_or_update_package(package_dict, harvest_object, package_dict_form='package_show')
             return result
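The ckan_mapper module referenced by the new import is not part of this commit, so the body of pyjq_mapper does not appear in the diff. A minimal sketch of what a pyjq-based default field mapping could look like, assuming it covers the same fields the removed hand-written mapping handled (the DEFAULT_FILTERS name, the jq filter strings, and the function body are illustrative assumptions, not the actual ckanext.odsh.harvesters.ckan_mapper code):

    # Hypothetical sketch only -- not the actual ckan_mapper module from this repository.
    import pyjq

    # jq filters for the default CKAN fields; the source keys are the ones the removed code read.
    DEFAULT_FILTERS = {
        'title': '.Titel',
        'notes': '.Beschreibung',
        'license_id': '.Nutzungsbestimmungen.ID_derLizenz[0]',
        'author': '.VeroeffentlichendeStelle.Name',
        'author_email': '.VeroeffentlichendeStelle.EMailAdresse',
    }

    def pyjq_mapper(values):
        # Start from an empty package skeleton and fill the default fields via jq lookups.
        package = {'resources': [], 'tags': [], 'groups': []}
        for field, jq_filter in DEFAULT_FILTERS.items():
            package[field] = pyjq.first(jq_filter, values)
        return package

With the default mapping delegated to pyjq_mapper, the new dcat_mapper body above only has to add the metadata that is not part of the harvested document (owner_org and a freshly generated id) before handing the dict to _create_or_update_package.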
@@ -180,25 +138,10 @@ class StatistikNordHarvester(HarvesterBase):
             self._save_object_error('Validation Error: %s' % str(e.error_summary), harvest_object, 'Import')
             return False
-    def map_to_group(self, package_dict, values):
-        # open file with the mapping from numbers to DCAT-DE vocabulary:
-        with open('/usr/lib/ckan/default/src/ckanext-odsh/ckanext/odsh/harvesters/number_dcat_de.json') as f:
-            dcat_theme = json.load(f)
-        # get the code
-        code = values['StANKategorie']
-        # if possible map it to a group
-        if dcat_theme.has_key(str(code)):
-            for item in dcat_theme[str(code)]:
-                package_dict['groups'].append({'name': item})
-                log.debug("DEBUG: DCAT-DE Code Mapping from %s to %s", str(code), item)
-        else:
-            # no valid group found.
-            package_dict['groups'].append({'name': "na"})
     @staticmethod
     def _get_content(url):
         url = url.replace(' ', '%20')
-        log.debug("get_content StatistikNord harvester: %s" % url)
+        # log.debug("get_content StatistikNord harvester: %s" % url)
         try:
             http_response = urllib2.urlopen(url, timeout=100000)
             content = http_response.read()
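The removed map_to_group helper expected number_dcat_de.json to map a StANKategorie code (as a string key) to a list of DCAT-DE group names, falling back to the group "na" when no entry exists. A small illustration of that lookup with assumed file contents (the real mapping file is not included in this commit):

    # Illustrative only: the contents of number_dcat_de.json are assumed, not taken from the repository.
    import json

    dcat_theme = json.loads('{"1": ["soci"], "2": ["educ", "soci"]}')
    code = 2  # e.g. values['StANKategorie']
    groups = [{'name': item} for item in dcat_theme.get(str(code), ["na"])]
    # groups == [{'name': 'educ'}, {'name': 'soci'}]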
@@ -212,9 +155,7 @@ class StatistikNordHarvester(HarvesterBase):
     def get_documents_from_content(content):
         fetched_xml = etree.fromstring(content)
         fetched_string = etree.tostring(fetched_xml)
         fetched_document = StatistikNordDocuments(fetched_string)
         fetched_values = fetched_document.read_values()
         return fetched_values
@@ -236,22 +177,6 @@ class StatistikNordHarvester(HarvesterBase):
         else:
             return quelle + ':' + guid.strip()
-    def add_groups_to_fetched_values(self, fetched_values):
-        groups = []
-        if 'StANProdukte' in fetched_values and '4' in fetched_values['StANProdukte']:
-            log.debug("Get Groups from database")
-            groups = self.get_all_groups()
-        #else:
-        #    if 'StANThemen' in fetched_values:
-        #        groups = self.translate_group(fetched_values['StANThemen'])
-        fetched_values['Kategorie'] = {}
-        fetched_values['Kategorie']['NameDerKategorie'] = []
-        if groups:
-            fetched_values['Kategorie']['NameDerKategorie'] = groups
-        log.debug(fetched_values['Kategorie']['NameDerKategorie'])
-        return fetched_values
     @staticmethod
     def get_all_groups():