# odsh_dcat_de_profile.py

import rdflib
from rdflib.namespace import DCTERMS

from ckan.common import config
import ckan.lib.helpers as helpers
import ckan.model as model
from ckanext.dcat.profiles import DCT
from ckanext.dcat.utils import resource_uri
import ckanext.dcatde.dataset_utils as ds_utils
from ckanext.dcatde.profiles import DCATdeProfile, DCATDE, DCAT, DCATDE_1_0, DCATDE_1_0_1

import ckanext.odsh.helpers as helpers_odsh
import ckanext.odsh.collection.helpers as helpers_collection
import logging

# Module-level logger named after this module.
log = logging.getLogger(__name__)

# RDF vocabularies used by the profile below.
# NOTE: DCT and DCAT are also imported further up in this file; the
# assignments here rebind those names at module level.
DCT = rdflib.Namespace("http://purl.org/dc/terms/")
DCAT = rdflib.Namespace("http://www.w3.org/ns/dcat#")
DCATAP = rdflib.Namespace("http://data.europa.eu/r5r/")
DCATDE_1_0_2 = rdflib.Namespace("http://dcat-ap.de/def/dcatde/1.0.2/")
ADMS = rdflib.Namespace("http://www.w3.org/ns/adms#")


class ODSHDCATdeProfile(DCATdeProfile):

    # from RDF

    def parse_dataset(self, dataset_dict, dataset_ref):
        '''
        Builds the CKAN dataset dict from the RDF graph.

        Runs the parent profile's parser first and then the ODSH specific
        parsing steps, in a fixed order. If the dataset is a version of
        another dataset (dct:isVersionOf), it is additionally marked for
        being added to a CKAN collection.

        Returns the (updated) dataset_dict.
        '''
        dataset_dict = super(ODSHDCATdeProfile, self).parse_dataset(
            dataset_dict, dataset_ref)

        # The steps are applied strictly in this order.
        odsh_parsing_steps = (
            self._parse_distributions,
            self._parse_type,
            self._parse_political_geocoding_uri,
            self._parse_references,
            self._parse_applicable_legislation,
            self._parse_hvd_category,
        )
        for parsing_step in odsh_parsing_steps:
            parsing_step(dataset_dict, dataset_ref)

        if self._belongs_to_collection(dataset_dict, dataset_ref):
            self._mark_for_adding_to_ckan_collection(dataset_dict, dataset_ref)

        return dataset_dict

    def _parse_distributions(self, dataset_dict, dataset_ref):
        '''
        Copies the first licenseAttributionByText found on a distribution
        into the dataset's extras.

        Distributions in the graph are matched to CKAN resources by URI.
        For a matching distribution the property is looked up in every
        known DCAT-AP.de namespace version; as soon as one value is found
        it is stored and the search ends.
        '''
        dcatde_namespaces = (DCATDE, DCATDE_1_0, DCATDE_1_0_1, DCATDE_1_0_2)

        for distribution in self.g.objects(dataset_ref, DCAT.distribution):
            distribution_uri = str(distribution)
            for resource_dict in dataset_dict.get('resources', []):
                # Only handle the resource that belongs to this distribution
                if distribution_uri != resource_uri(resource_dict):
                    continue
                for namespace in dcatde_namespaces:
                    attribution_text = self._object_value(
                        distribution, namespace.licenseAttributionByText)
                    if not attribution_text:
                        continue
                    ds_utils.insert_new_extras_field(
                        dataset_dict, 'licenseAttributionByText',
                        attribution_text)
                    # The first value found wins
                    return

    def _parse_type(self, dataset_dict, dataset_ref):
        '''
        Sets the CKAN dataset type from the dataset's dct:type, if present.

        The mapping from dct:type to the CKAN type is delegated to
        helpers_odsh.map_dct_type_to_ckan_type.
        '''
        rdf_type = self._object(dataset_ref, DCT.type)
        if not rdf_type:
            return
        dataset_dict['type'] = helpers_odsh.map_dct_type_to_ckan_type(
            str(rdf_type))

    def _parse_references(self, dataset_dict, dataset_ref):
        '''
        Stores the dataset's dct:references object, if there is one, in the
        extras field 'reference'.
        '''
        reference = self._object(dataset_ref, DCT.references)
        if not reference:
            return
        ds_utils.insert_new_extras_field(dataset_dict, 'reference', reference)

    def _parse_metadata_list(self, dataset_dict, dataset_ref, field_name, rdf_predicate):
        '''
        Collects all objects of a predicate into one extras field.

        Parameters:
        - dataset_dict: The CKAN dataset dictionary that receives the field.
        - dataset_ref: The reference to the dataset in the RDF graph.
        - field_name: The key of the extras field to write.
        - rdf_predicate: The predicate whose objects are collected.

        CKAN extras hold plain strings, so the values are serialized as
        "{value1,value2,...}". This is the same curly brace format that
        _add_extra_field splits again when exporting. Empty values are
        skipped, and nothing is stored if no value remains.
        '''
        values = [
            str(obj) for obj in self.g.objects(dataset_ref, rdf_predicate)
        ]
        # An empty entry would only produce a stray comma in the result
        values = [value for value in values if value]
        if values:
            ds_utils.insert_new_extras_field(
                dataset_dict, field_name, "{" + ",".join(values) + "}")

    def _parse_applicable_legislation(self, dataset_dict, dataset_ref):
        '''
        Stores all dcatap:applicableLegislation objects of the dataset in
        the extras field 'applicableLegislation'.

        The serialization is done by _parse_metadata_list.
        '''
        self._parse_metadata_list(
            dataset_dict, dataset_ref,
            'applicableLegislation', DCATAP.applicableLegislation)

    def _parse_hvd_category(self, dataset_dict, dataset_ref):
        '''
        Stores all dcatap:hvdCategory objects of the dataset in the extras
        field 'hvdCategory'.

        The serialization is done by _parse_metadata_list.
        '''
        self._parse_metadata_list(
            dataset_dict, dataset_ref,
            'hvdCategory', DCATAP.hvdCategory)

    def _parse_political_geocoding_uri(self, dataset_dict, dataset_ref):
        '''
        Falls back to politicalGeocodingURI when spatial_uri is missing.

        There are datasets that have a politicalGeocodingURI but no
        spatial_uri. For those, the politicalGeocodingURI is stored as the
        extras field 'spatial_uri'.
        '''
        extras = dataset_dict.get('extras')
        political_geocoding_uri = helpers_odsh.odsh_extract_value_from_extras(
            extras=extras, key='politicalGeocodingURI')
        spatial_uri = helpers_odsh.odsh_extract_value_from_extras(
            extras=extras, key='spatial_uri')

        if spatial_uri or not political_geocoding_uri:
            return

        if '"' in political_geocoding_uri:
            # Keep only the text that follows the first double quote, up
            # to the next double quote
            political_geocoding_uri = political_geocoding_uri.split('"')[1]
        ds_utils.insert_new_extras_field(
            dataset_dict, 'spatial_uri', political_geocoding_uri)

    def _belongs_to_collection(self, dataset_dict, dataset_ref):
        '''
        Returns True if the dataset has a dct:isVersionOf object in the
        graph, False otherwise.
        '''
        return bool(self._object(dataset_ref, DCT.isVersionOf))

    def _mark_for_adding_to_ckan_collection(self, dataset_dict, dataset_ref):
        dataset_dict.update({'add_to_collection': True})

    # to RDF

    def graph_from_dataset(self, dataset_dict, dataset_ref):
        '''
        Writes the dataset to the RDF graph.

        The parent profile creates the graph first; afterwards the ODSH
        specific information is added: contributor id, license attribution
        text, type, modified/issued dates of the distributions, the fields
        'reference', 'applicableLegislation' and 'hvdCategory', version
        information and the relations between collections and their
        member datasets.
        '''
        super(ODSHDCATdeProfile, self).graph_from_dataset(
            dataset_dict, dataset_ref)

        self._add_contributor_id(dataset_dict, dataset_ref)
        self._add_license_attribution_by_text(dataset_dict, dataset_ref)
        self._add_type(dataset_dict, dataset_ref)
        self._add_modified_and_issued(dataset_dict, dataset_ref)

        # Dataset fields that are exported as URI valued properties
        uri_fields = (
            ('reference', DCT.references),
            ('applicableLegislation', DCATAP.applicableLegislation),
            ('hvdCategory', DCATAP.hvdCategory),
        )
        for field_name, rdf_property in uri_fields:
            self._add_extra_field(
                dataset_dict, dataset_ref, field_name, rdf_property)

        self._add_version(dataset_dict, dataset_ref)

        if self._is_dataset_collection(dataset_dict):
            self._remove_predefined_collection_members()
            self._add_collection_members(dataset_dict, dataset_ref)

        if self._dataset_belongs_to_collection(dataset_dict):
            self._add_collection(dataset_dict, dataset_ref)

    def _add_contributor_id(self, dataset_dict, dataset_ref):
        '''
        Adds the fixed Schleswig-Holstein contributor URI as
        dcatde:contributorID of the dataset.
        '''
        contributor_uri = rdflib.URIRef(
            'http://dcat-ap.de/def/contributors/schleswigHolstein')
        self.g.add((dataset_ref, DCATDE.contributorID, contributor_uri))

    def _add_license_attribution_by_text(self, dataset_dict, dataset_ref):
        '''
        Sets dcatde:licenseAttributionByText on the dataset and on each of
        its distributions, using the dataset's 'licenseAttributionByText'
        value. Does nothing if the dataset has no such value.
        '''
        attribution_text = self._get_dataset_value(
            dataset_dict, 'licenseAttributionByText')
        if not attribution_text:
            return

        predicate = DCATDE.licenseAttributionByText
        attribution = rdflib.Literal(attribution_text)

        self.g.set((dataset_ref, predicate, attribution))
        for distribution in self.g.objects(dataset_ref, DCAT.distribution):
            self.g.set((distribution, predicate, attribution))

    def _add_extra_field(self, dataset_dict, dataset_ref, field_name, rdf_property):
        '''
        Adds a (possibly multi-valued) dataset field to the RDF graph as URIs.

        Parameters:
        - dataset_dict: A dictionary containing dataset information.
        - dataset_ref: The reference to the dataset in the RDF graph.
        - field_name: The name of the field to read from the dataset dictionary.
        - rdf_property: The RDF property the field value is written to.

        CKAN's extras do not natively support lists, so several values are
        stored as one string of the form "{value1,value2}". Surrounding
        whitespace and curly braces are removed, the string is split at
        commas, and every non-empty, whitespace-trimmed part becomes a
        URIRef object of rdf_property.

        A single value is written with Graph.set, several values are
        written with Graph.add. Nothing is written if the field is missing
        or contains no non-empty value (e.g. "{}").
        '''
        field_value_str = dataset_dict.get(field_name)
        if not field_value_str:
            return

        # Strip whitespace first, otherwise " {a,b} " would keep its braces
        values_string = field_value_str.strip().strip('{}')

        field_values = [value.strip() for value in values_string.split(',')]
        # Empty parts (from "{}" or a trailing comma) must not end up as
        # empty URIs in the graph
        field_values = [value for value in field_values if value]

        if not field_values:
            return

        if len(field_values) == 1:
            # Treat it as a single value
            self.g.set(
                (dataset_ref, rdf_property, rdflib.URIRef(field_values[0]))
            )
            return

        for value in field_values:
            self.g.add(
                (dataset_ref, rdf_property, rdflib.URIRef(value))
            )

    def _add_modified_and_issued(self, dataset_dict, dataset_ref):
        '''
        Sets dct:modified and dct:issued of each distribution from the
        'last_modified' and 'created' values of the matching CKAN resource.

        Distributions and resources are matched by URI. Values are written
        as xsd:dateTime literals; missing or empty values are skipped.
        '''
        xsd_datetime = "http://www.w3.org/2001/XMLSchema#dateTime"
        # (key in the resource dict, predicate in the graph)
        timestamp_fields = (
            ('last_modified', DCT.modified),
            ('created', DCT.issued),
        )

        for distribution in self.g.objects(dataset_ref, DCAT.distribution):
            distribution_uri = str(distribution)
            for resource_dict in dataset_dict.get('resources', []):
                # Only handle the resource that belongs to this distribution
                if distribution_uri != resource_uri(resource_dict):
                    continue
                for key, predicate in timestamp_fields:
                    timestamp = resource_dict.get(key, None)
                    if not timestamp:
                        continue
                    self.g.set(
                        (distribution, predicate,
                         rdflib.Literal(timestamp, datatype=xsd_datetime))
                    )

    def _add_version(self, dataset_dict, dataset_ref):
        '''
        Adds version information of the dataset to the graph.

        The value of 'is_replaced_by' is written as URI to
        dcterms:isReplacedBy, the value of 'version_notes' as literal to
        adms:versionNotes. Missing or empty values are skipped.
        '''
        replacement_uri = dataset_dict.get('is_replaced_by')
        if replacement_uri:
            replaced_by = rdflib.URIRef(replacement_uri)
            self.g.set((dataset_ref, DCTERMS.isReplacedBy, replaced_by))

        notes_text = dataset_dict.get('version_notes')
        if notes_text:
            notes = rdflib.Literal(notes_text)
            self.g.set((dataset_ref, ADMS.versionNotes, notes))

    def _add_type(self, dataset_dict, dataset_ref):
        '''
        Sets dct:type of the dataset if the CKAN type maps to a dct:type.

        The mapping is delegated to helpers_odsh.map_ckan_type_to_dct_type;
        nothing is written when it yields no value.
        '''
        dct_type = helpers_odsh.map_ckan_type_to_dct_type(
            self._get_ckan_type(dataset_dict))
        if not dct_type:
            return
        self.g.set((dataset_ref, DCT.type, rdflib.URIRef(dct_type)))

    def _get_ckan_type(self, dataset_dict):
        '''
        Returns the 'type' value of the dataset as provided by
        self._get_dataset_value.
        '''
        return self._get_dataset_value(dataset_dict, 'type')

    def _remove_predefined_collection_members(self):
        '''
        Removes every dct:hasVersion triple from the graph.

        The removal is not limited to one dataset: all triples with this
        predicate are dropped, whatever their subject is.
        '''
        # A wildcard pattern lets the graph do the removal itself, so the
        # graph is not modified while it is being iterated over.
        self.g.remove((None, DCT.hasVersion, None))

    def _add_collection_members(self, dataset_dict, dataset_ref):
        '''
        Adds one dct:hasVersion triple per dataset that belongs to the
        collection described by dataset_dict.
        '''
        member_refs = self._get_dataset_refs_belonging_to_collection(
            dataset_dict)
        for member_ref in member_refs:
            member_uri = rdflib.URIRef(member_ref)
            self.g.add((dataset_ref, DCT.hasVersion, member_uri))

    def _is_dataset_collection(self, dataset_dict):
        '''
        Returns True if the CKAN type of the dataset is 'collection'.
        '''
        return self._get_ckan_type(dataset_dict) == 'collection'

    def _get_dataset_refs_belonging_to_collection(self, dataset_dict):
        '''
        Returns a list with one reference (URL string) for each dataset
        name that helpers_collection.get_dataset_names yields for the
        given collection.
        '''
        member_refs = []
        for dataset_name in helpers_collection.get_dataset_names(dataset_dict):
            member_refs.append(self._construct_refs(dataset_name))
        return member_refs

    @staticmethod
    def _construct_refs(id):
        '''
        Builds the reference of a dataset by appending the result of
        url_for('dataset.read') for the given id to the configured
        'ckan.site_url'.
        '''
        return config.get('ckan.site_url') + helpers.url_for(
            'dataset.read', id=id)

    def _dataset_belongs_to_collection(self, dataset_dict):
        '''
        returns True if a containing collection is found
        '''
        if dataset_dict.get('type') == 'collection':
            return False
        collection_name = helpers_collection.get_collection_id(dataset_dict)
        return collection_name is not None

    def _add_collection(self, dataset_dict, dataset_ref):
        '''
        Sets dct:isVersionOf of the dataset to the reference of the
        collection that helpers_collection.get_collection_id returns for it.
        '''
        collection_uri = self._construct_refs(
            helpers_collection.get_collection_id(dataset_dict))
        self.g.set(
            (dataset_ref, DCT.isVersionOf, rdflib.URIRef(collection_uri)))