Skip to content
Snippets Groups Projects
odsh_dcat_de_profile.py 14 KiB
Newer Older
  • Learn to ignore specific revisions
  • root's avatar
    root committed
    import rdflib
    
    from rdflib.namespace import DCTERMS
    
    root's avatar
    root committed
    
    from ckan.common import config
    import ckan.lib.helpers as helpers
    import ckan.model as model
    from ckanext.dcat.profiles import DCT
    from ckanext.dcat.utils import resource_uri
    import ckanext.dcatde.dataset_utils as ds_utils
    
    Thorge Petersen's avatar
    Thorge Petersen committed
    from ckanext.dcatde.profiles import DCATdeProfile, DCATDE, DCAT, DCATDE_1_0, DCATDE_1_0_1
    
    root's avatar
    root committed
    
    
    import ckanext.odsh.helpers as helpers_odsh
    
    root's avatar
    root committed
    import ckanext.odsh.collection.helpers as helpers_collection
    
    root's avatar
    root committed
    
    # Namespace objects for Dublin Core terms and DCAT.
    # NOTE(review): these assignments rebind `DCT` and `DCAT`, which are also
    # imported above from ckanext.dcat.profiles / ckanext.dcatde.profiles;
    # presumably the values are identical - confirm, then drop one of the two.
    DCT = rdflib.namespace.Namespace("http://purl.org/dc/terms/")
    DCAT = rdflib.namespace.Namespace("http://www.w3.org/ns/dcat#")
    
    Thorge Petersen's avatar
    Thorge Petersen committed
    # Namespace whose applicableLegislation and hvdCategory terms are used
    # by this profile.
    DCATAP = rdflib.namespace.Namespace("http://data.europa.eu/r5r/")

    # DCAT-AP.de 1.0.2 namespace; checked together with the imported DCATDE
    # namespaces when licenseAttributionByText is parsed.
    DCATDE_1_0_2 = rdflib.namespace.Namespace(
        "http://dcat-ap.de/def/dcatde/1.0.2/")

    # ADMS namespace; used for adms:versionNotes.
    ADMS = rdflib.namespace.Namespace("http://www.w3.org/ns/adms#")
    
    root's avatar
    root committed
    
    
    root's avatar
    root committed
    class ODSHDCATdeProfile(DCATdeProfile):
    
        # from RDF
    
        def parse_dataset(self, dataset_dict, dataset_ref):
            '''
            Build the CKAN dataset dict for one dataset of the RDF graph.

            The parent profile parses first; its result is then handed to the
            ODSH-specific parsing steps in a fixed order. A dataset that
            belongs to a collection is additionally marked so that it can be
            added to the CKAN collection later.

            Returns the dataset dict returned by the parent profile.
            '''
            parsed = super(ODSHDCATdeProfile, self).parse_dataset(
                dataset_dict, dataset_ref)

            odsh_steps = (
                self._parse_distributions,
                self._parse_type,
                self._parse_political_geocoding_uri,
                self._parse_references,
                self._parse_applicable_legislation,
                self._parse_hvd_category,
            )
            for step in odsh_steps:
                step(parsed, dataset_ref)

            if self._belongs_to_collection(parsed, dataset_ref):
                self._mark_for_adding_to_ckan_collection(parsed, dataset_ref)
            return parsed
    
    root's avatar
    root committed
        def _parse_distributions(self, dataset_dict, dataset_ref):
            '''
            Copy licenseAttributionByText from a distribution into the extras.

            Each dcat:distribution of the dataset is compared with the CKAN
            resources. For a matching pair the property is looked up in every
            known DCAT-AP.de namespace; the first non-empty value is stored as
            the 'licenseAttributionByText' extra and the search stops.
            '''
            known_namespaces = (DCATDE, DCATDE_1_0, DCATDE_1_0_1, DCATDE_1_0_2)
            for dist_ref in self.g.objects(dataset_ref, DCAT.distribution):
                for res in dataset_dict.get('resources', []):
                    # Only the resource that represents this distribution
                    # is of interest.
                    if str(dist_ref) != resource_uri(res):
                        continue
                    for ns in known_namespaces:
                        text = self._object_value(
                            dist_ref, ns.licenseAttributionByText)
                        if not text:
                            continue
                        ds_utils.insert_new_extras_field(
                            dataset_dict, 'licenseAttributionByText', text)
                        return
    
    root's avatar
    root committed
        def _parse_type(self, dataset_dict, dataset_ref):
            '''
            Set the CKAN dataset type from the dataset's dct:type.

            Nothing is changed when the graph has no dct:type for the dataset.
            '''
            type_ref = self._object(dataset_ref, DCT.type)
            if not type_ref:
                return
            dataset_dict['type'] = helpers_odsh.map_dct_type_to_ckan_type(
                str(type_ref))
    
        def _parse_references(self, dataset_dict, dataset_ref):
            '''
            Store the dataset's dct:references object as the 'reference' extra.

            Nothing is written when the graph has no such statement.
            '''
            reference = self._object(dataset_ref, DCT.references)
            if not reference:
                return
            ds_utils.insert_new_extras_field(dataset_dict, 'reference', reference)
    
        def _parse_metadata_list(self, dataset_dict, dataset_ref, field_name, rdf_predicate):
            '''
            Collect all objects of one predicate into a single extras field.

            Parameters:
            - dataset_dict: CKAN dataset dict that receives the extra.
            - dataset_ref: The dataset node in the RDF graph.
            - field_name: Key of the extras field to write.
            - rdf_predicate: Predicate whose objects are collected.

            The string forms of the objects are joined with commas and wrapped
            in curly braces, e.g. "{a,b}". Objects whose string form is empty
            are skipped so the encoded list never contains empty entries.
            Nothing is written when no value is found.
            '''
            values = [str(obj) for obj in self.g.objects(dataset_ref, rdf_predicate)]
            joined = ",".join(value for value in values if value)
            if joined:
                ds_utils.insert_new_extras_field(
                    dataset_dict, field_name, "{" + joined + "}")
    
        def _parse_applicable_legislation(self, dataset_dict, dataset_ref):
            '''
            Store all applicableLegislation objects of the dataset in the
            'applicableLegislation' extra.

            NOTE(review): a second definition of this method further down in
            the class takes precedence at runtime.
            '''
            self._parse_metadata_list(
                dataset_dict,
                dataset_ref,
                field_name='applicableLegislation',
                rdf_predicate=DCATAP.applicableLegislation,
            )
    
        def _parse_hvd_category(self, dataset_dict, dataset_ref):
            '''
            Store all hvdCategory objects of the dataset in the 'hvdCategory'
            extra.

            NOTE(review): a second definition of this method further down in
            the class takes precedence at runtime.
            '''
            self._parse_metadata_list(
                dataset_dict,
                dataset_ref,
                field_name='hvdCategory',
                rdf_predicate=DCATAP.hvdCategory,
            )
    
    
        def _parse_applicable_legislation(self, dataset_dict, dataset_ref):
            '''
            Store all applicableLegislation objects of the dataset in the
            'applicableLegislation' extra.

            Delegates to _parse_metadata_list so that the joining and the
            "{...}" encoding of the values live in one place.
            '''
            self._parse_metadata_list(
                dataset_dict, dataset_ref,
                'applicableLegislation', DCATAP.applicableLegislation)
    
    
        def _parse_hvd_category(self, dataset_dict, dataset_ref):
            '''
            Store all hvdCategory objects of the dataset in the 'hvdCategory'
            extra.

            Delegates to _parse_metadata_list so that the joining and the
            "{...}" encoding of the values live in one place.
            '''
            self._parse_metadata_list(
                dataset_dict, dataset_ref,
                'hvdCategory', DCATAP.hvdCategory)
    
        def _parse_political_geocoding_uri(self, dataset_dict, dataset_ref):
            '''
            Use politicalGeocodingURI as spatial_uri when the latter is missing.

            Some datasets carry a politicalGeocodingURI extra but no
            spatial_uri. If the value contains double quotes, only the text
            after the first quote (up to the next one) is used.
            '''
            extras = dataset_dict.get('extras')
            geocoding_uri = helpers_odsh.odsh_extract_value_from_extras(
                extras=extras, key='politicalGeocodingURI')
            existing_spatial_uri = helpers_odsh.odsh_extract_value_from_extras(
                extras=extras, key='spatial_uri')

            # Nothing to do if a spatial_uri exists or there is no fallback.
            if existing_spatial_uri or not geocoding_uri:
                return

            if '"' in geocoding_uri:
                geocoding_uri = geocoding_uri.split('"')[1]
            ds_utils.insert_new_extras_field(
                dataset_dict, 'spatial_uri', geocoding_uri)
    
    
    root's avatar
    root committed
        def _belongs_to_collection(self, dataset_dict, dataset_ref):
            '''
            Tell whether the dataset has a dct:isVersionOf statement.

            Returns True if the graph yields a truthy object for that
            predicate, otherwise False.
            '''
            return bool(self._object(dataset_ref, DCT.isVersionOf))
    
    root's avatar
    root committed
        def _mark_for_adding_to_ckan_collection(self, dataset_dict, dataset_ref):
            dataset_dict.update({'add_to_collection': True})
    
    
    root's avatar
    root committed
        def graph_from_dataset(self, dataset_dict, dataset_ref):
            '''
            Serialise a CKAN dataset dict into the RDF graph.

            The parent profile builds the base graph first. Afterwards the
            ODSH-specific statements are added in a fixed order, followed by
            the statements that link collections and their member datasets.
            '''
            super(ODSHDCATdeProfile, self).graph_from_dataset(
                dataset_dict, dataset_ref)

            for add_statements in (
                    self._add_contributor_id,
                    self._add_license_attribution_by_text,
                    self._add_type,
                    self._add_modified_and_issued):
                add_statements(dataset_dict, dataset_ref)

            # Fields that are exported through the generic extra-field writer.
            extra_field_properties = (
                ('reference', DCT.references),
                ('applicableLegislation', DCATAP.applicableLegislation),
                ('hvdCategory', DCATAP.hvdCategory),
            )
            for field_name, rdf_property in extra_field_properties:
                self._add_extra_field(
                    dataset_dict, dataset_ref, field_name, rdf_property)

            self._add_version(dataset_dict, dataset_ref)

            if self._is_dataset_collection(dataset_dict):
                self._remove_predefined_collection_members()
                self._add_collection_members(dataset_dict, dataset_ref)
            if self._dataset_belongs_to_collection(dataset_dict):
                self._add_collection(dataset_dict, dataset_ref)
    
    root's avatar
    root committed
        def _add_contributor_id(self, dataset_dict, dataset_ref):
            '''
            Add the fixed Schleswig-Holstein contributor URI to the dataset.

            NOTE(review): the subject and predicate of the triple were not
            present in this block; (dataset_ref, DCATDE.contributorID) is
            assumed - confirm against the complete file.
            '''
            contributor_uri = 'http://dcat-ap.de/def/contributors/schleswigHolstein'
            self.g.add(
                (dataset_ref, DCATDE.contributorID,
                 rdflib.URIRef(contributor_uri))
            )
    
    root's avatar
    root committed
        def _add_license_attribution_by_text(self, dataset_dict, dataset_ref):
            '''
            Write licenseAttributionByText to the dataset and its distributions.

            The value is read from the dataset dict; when it is missing or
            empty the graph is left unchanged. Existing statements for the
            property are replaced.
            '''
            attribution = self._get_dataset_value(
                dataset_dict, 'licenseAttributionByText')
            if not attribution:
                return

            predicate = DCATDE.licenseAttributionByText
            self.g.set((dataset_ref, predicate, rdflib.Literal(attribution)))
            for dist_ref in self.g.objects(dataset_ref, DCAT.distribution):
                self.g.set((dist_ref, predicate, rdflib.Literal(attribution)))
    
        def _add_extra_field(self, dataset_dict, dataset_ref, field_name, rdf_property):
            '''
            Adds an extra field from a dataset dictionary to the RDF graph.

            Parameters:
            - dataset_dict: A dictionary containing dataset information.
            - dataset_ref: The reference to the dataset in the RDF graph.
            - field_name: The name of the field to extract from the dataset dictionary.
            - rdf_property: The RDF property to which the field value corresponds.

            The field value is expected to be a string. Because CKAN's extras
            do not natively support lists, a list of values is represented as
            comma-separated entries enclosed in curly braces ("{a,b}").
            Each entry is written to the graph as a URIRef:
            - several entries are added one by one,
            - a single entry replaces any existing statement for the property.
            A missing or empty value (including "{}" and blank entries) adds
            nothing to the graph.
            '''
            # NOTE(review): only top-level keys of dataset_dict are consulted
            # here; confirm these fields are never stored under 'extras'.
            field_value_str = dataset_dict.get(field_name)
            if not field_value_str:
                return

            # Drop the surrounding braces, split on commas and discard blank
            # entries so that no empty URI ends up in the graph.
            entries = field_value_str.strip().strip('{}').split(',')
            field_values = [entry.strip() for entry in entries if entry.strip()]
            if not field_values:
                return

            if len(field_values) > 1:
                for value in field_values:
                    self.g.add(
                        (dataset_ref, rdf_property, rdflib.URIRef(value))
                    )
            else:
                # Treat it as a single value
                self.g.set(
                    (dataset_ref, rdf_property, rdflib.URIRef(field_values[0]))
                )
    
        def _add_modified_and_issued(self, dataset_dict, dataset_ref):
            '''
            Export resource timestamps to the matching distributions.

            For every distribution that corresponds to a CKAN resource, the
            resource's 'last_modified' value is written as dct:modified and
            its 'created' value as dct:issued, both typed as xsd:dateTime.
            Missing or empty timestamps are skipped.
            '''
            timestamp_properties = (
                ('last_modified', DCT.modified),
                ('created', DCT.issued),
            )
            for dist_ref in self.g.objects(dataset_ref, DCAT.distribution):
                for res in dataset_dict.get('resources', []):
                    # Only the resource that represents this distribution
                    # is of interest.
                    if str(dist_ref) != resource_uri(res):
                        continue
                    for key, predicate in timestamp_properties:
                        stamp = res.get(key, None)
                        if not stamp:
                            continue
                        typed_stamp = rdflib.Literal(
                            stamp,
                            datatype="http://www.w3.org/2001/XMLSchema#dateTime")
                        self.g.set((dist_ref, predicate, typed_stamp))
    
    
        def _add_version(self, dataset_dict, dataset_ref):
            '''
            Adds CKAN isReplacedBy extra field to dublin core isReplacedBy and sets version information.

            Sets dct:isReplacedBy (as a URI) when new_version is truthy and
            adms:versionNotes (as a literal) when version_notes is truthy.
            '''
    
            # NOTE(review): new_version and version_notes are not assigned
            # anywhere in this method as shown; presumably they are read from
            # dataset_dict - confirm against the complete file.
            if new_version:
                self.g.set(
                    (dataset_ref, DCTERMS.isReplacedBy,
                        rdflib.URIRef(new_version)
                     )
                )
    
            if version_notes:
                self.g.set(
                    (dataset_ref, ADMS.versionNotes,
                        rdflib.Literal(version_notes)
                     )
                )
    
    root's avatar
    root committed
        def _add_type(self, dataset_dict, dataset_ref):
            '''
            Adds the type if there is a known mapping from ckan type to dct:type.

            The CKAN type of the dataset is mapped to a dct:type URI; when the
            mapping yields a value it replaces any existing dct:type statement
            of the dataset.

            NOTE(review): the subject and predicate of the triple were not
            present in this block; (dataset_ref, DCT.type) is assumed because
            _parse_type reads the same predicate - confirm.
            '''
            ckan_type = self._get_ckan_type(dataset_dict)
            dct_type = helpers_odsh.map_ckan_type_to_dct_type(ckan_type)
            if dct_type:
                self.g.set(
                    (dataset_ref, DCT.type,
                     rdflib.URIRef(dct_type))
                )
    
    root's avatar
    root committed
        def _get_ckan_type(self, dataset_dict):
            '''
            Return the value stored for 'type' in the dataset dict.
            '''
            return self._get_dataset_value(dataset_dict, 'type')
    
    root's avatar
    root committed
        def _remove_predefined_collection_members(self):
            '''
            Remove all dct:hasVersion statements from the graph.

            NOTE(review): the filter condition was not present in this block;
            dct:hasVersion is assumed because _add_collection_members writes
            that predicate - confirm against the complete file.
            '''
            # None acts as a wildcard in Graph.remove, so every matching
            # triple is dropped without iterating over (and mutating) the
            # graph at the same time.
            self.g.remove((None, DCT.hasVersion, None))
    
    root's avatar
    root committed
        def _add_collection_members(self, dataset_dict, dataset_ref):
            '''
            Link the collection to each of its member datasets.

            One dct:hasVersion statement is added per dataset reference that
            belongs to the collection.
            '''
            member_refs = self._get_dataset_refs_belonging_to_collection(
                dataset_dict)
            for member_ref in member_refs:
                member_node = rdflib.URIRef(member_ref)
                self.g.add((dataset_ref, DCT.hasVersion, member_node))
    
    root's avatar
    root committed
        def _is_dataset_collection(self, dataset_dict):
            '''
            Return True if the dataset's CKAN type is 'collection'.

            NOTE(review): the assignment of the returned flag was not present
            in this block; the comparison with 'collection' is assumed from
            the same check in _dataset_belongs_to_collection - confirm.
            '''
            return self._get_ckan_type(dataset_dict) == 'collection'
    
    root's avatar
    root committed
        def _get_dataset_refs_belonging_to_collection(self, dataset_dict):
            '''
            Return a list with one reference per dataset name of the collection.

            The names come from the collection helpers; each one is turned
            into a reference by _construct_refs.
            '''
            return [
                self._construct_refs(member_name)
                for member_name in helpers_collection.get_dataset_names(dataset_dict)
            ]
    
    root's avatar
    root committed
        @staticmethod
        def _construct_refs(id):
            '''
            Build the reference of a dataset from its id.

            Concatenates the configured 'ckan.site_url' with the path that
            url_for returns for the 'dataset.read' route and the given id.
            '''
            site_url = config.get('ckan.site_url')
            dataset_path = helpers.url_for('dataset.read', id=id)
            return site_url + dataset_path
    
    root's avatar
    root committed
        def _dataset_belongs_to_collection(self, dataset_dict):
            '''
            Returns True if a containing collection is found.

            A dataset that is itself of type 'collection' never belongs to
            another collection, so False is returned for it right away.
            '''
            if dataset_dict.get('type') == 'collection':
                return False

            collection_name = helpers_collection.get_collection_id(dataset_dict)
            # NOTE(review): the return statement was not present in this
            # block; presumably the helper yields None (or an empty value)
            # when the dataset has no collection - confirm.
            return bool(collection_name)
    
    root's avatar
    root committed
    
        def _add_collection(self, dataset_dict, dataset_ref):
    
            collection_id = helpers_collection.get_collection_id(dataset_dict)
    
    root's avatar
    root committed
            collection_uri = self._construct_refs(collection_id)
            self.g.set(
    
    root's avatar
    root committed
                    rdflib.URIRef(collection_uri)