diff --git a/CHANGELOG.md b/CHANGELOG.md index 9895eb16b4f7a5b321865421c3aeda24a6eae470..eeeab63a6eb508737cac47e951c9272407cac2c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed + +- Corrected handling of file hash and size updates to ensure they are properly calculated during resource updates and added enhanced error handling for missing files or processing failures during file attribute updates. + +### Removed + +- Completely removed the legacy plugin responsible for creating thumbnails, including all associated code and functionality. + ## [2.4.7] - 2025-01-20 ### Added diff --git a/ckanext/odsh/pdf_to_thumbnail/__init__.py b/ckanext/odsh/pdf_to_thumbnail/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/ckanext/odsh/pdf_to_thumbnail/action.py b/ckanext/odsh/pdf_to_thumbnail/action.py deleted file mode 100644 index d4a19d19f6eef062e52a7efc2b8e039a8d6af15e..0000000000000000000000000000000000000000 --- a/ckanext/odsh/pdf_to_thumbnail/action.py +++ /dev/null @@ -1,29 +0,0 @@ -# ckan -import ckan.plugins.toolkit as toolkit -import ckan.lib.helpers as helpers -from ckan.logic.action.update import package_update -from ckan.logic.action.delete import package_delete - -from . import thumbnail - - -def before_package_delete(context, package_id_dict): - pkg_dict = toolkit.get_action('package_show')(context, package_id_dict) - if helpers.check_access('package_delete', pkg_dict): - thumbnail.remove_thumbnail(context) - return package_delete(context, package_id_dict) - -def before_package_update(context, pkg_dict): - if helpers.check_access('package_update', pkg_dict): - package_id =pkg_dict.get('id') - package = toolkit.get_action('package_show')(context, {'id': package_id}) - old_private = package.get('private') - new_private = pkg_dict.get('private') - old_filename = package.get('thumbnail') - if old_filename: - if str(old_private) != str(new_private): - new_filename = thumbnail.rename_thumbnail_to_random_name(old_filename) - pkg_dict['extras'].append({'key': 'thumbnail', 'value': new_filename}) - elif not pkg_dict.get('thumbnail'): - pkg_dict['extras'].append({'key': 'thumbnail', 'value': old_filename}) - return package_update(context, pkg_dict) \ No newline at end of file diff --git a/ckanext/odsh/pdf_to_thumbnail/helpers.py b/ckanext/odsh/pdf_to_thumbnail/helpers.py deleted file mode 100644 index 82a5d15cdc6cfe3f700bf756579cdb11591fcddd..0000000000000000000000000000000000000000 --- a/ckanext/odsh/pdf_to_thumbnail/helpers.py +++ /dev/null @@ -1,26 +0,0 @@ - -from ckan.lib.helpers import is_url, url_for - -def thumbnail_namespace(filename): - return "/" + filename - -def get_download_link_for_thumbnail(package): - resources = package.get('resources') - for resource in resources[::-1]: - url_type =resource.get('url_type') - mimetype = resource.get('mimetype') - if url_type == 'upload' and mimetype == 'application/pdf': - package_id = resource.get('package_id') - resource_id = resource.get('id') - pre_resource_url = resource.get('url') - if is_url(pre_resource_url): - url_resource = pre_resource_url - else: - url_resource = url_for(named_route='dataset.resource_download', - id=package_id, - resource_id=resource_id, - filename=pre_resource_url, - qualified = True) - - - return url_resource diff --git a/ckanext/odsh/pdf_to_thumbnail/plugin.py b/ckanext/odsh/pdf_to_thumbnail/plugin.py deleted file mode 100644 index f0c5f4c005e239853720d4d3b737ed49a346a17e..0000000000000000000000000000000000000000 --- a/ckanext/odsh/pdf_to_thumbnail/plugin.py +++ /dev/null @@ -1,58 +0,0 @@ -import os - -#from ckan -import ckan.plugins as plugins - -#pdf_to_thumbnail -from . import thumbnail -from . import action as thumbnail_action -from . import helpers as thumbnail_helpers - -import logging -log = logging.getLogger(__name__) - - -class ThumbnailPlugin(plugins.SingletonPlugin): - plugins.implements(plugins.IResourceController, inherit=True) - plugins.implements(plugins.IConfigurer, inherit=True) - plugins.implements(plugins.IActions, inherit=True) - plugins.implements(plugins.ITemplateHelpers) - - -#IResourceController - def after_resource_create(self, context, resource): - resources = thumbnail.resources_of_containing_package(resource) - thumbnail.create_thumbnail_if_none_in_package(context, resources) - - def after_resource_update(self, context, resource): - resources = thumbnail.resources_of_containing_package(resource) - thumbnail.create_thumbnail_if_none_in_package(context, resources) - - def after_resource_delete(self, context, resources): - thumbnail.create_thumbnail_if_none_in_package(context, resources) - -#IConfigurer - - def update_config(self, config_): - storage_path = config_.get('ckan.storage_path') - public_dir = os.path.join(storage_path, 'thumbnail') - if config_.get('extra_public_paths'): - config_['extra_public_paths'] += ',' + public_dir - else: - config_['extra_public_paths'] = public_dir - -#IActions - - def get_actions(self): - return {'package_delete': thumbnail_action.before_package_delete, - 'package_update': thumbnail_action.before_package_update - } - -#ITemplateHelpers - - def get_helpers(self): - - return { - 'thumbnail_namespace':thumbnail_helpers.thumbnail_namespace, - 'thumbail_get_download_link':thumbnail_helpers.get_download_link_for_thumbnail - } diff --git a/ckanext/odsh/pdf_to_thumbnail/thumbnail.py b/ckanext/odsh/pdf_to_thumbnail/thumbnail.py deleted file mode 100644 index 25067f73e42c669c15593b929698b3b13fec6c0e..0000000000000000000000000000000000000000 --- a/ckanext/odsh/pdf_to_thumbnail/thumbnail.py +++ /dev/null @@ -1,223 +0,0 @@ -import os -import magic -from pdf2image import convert_from_bytes -import logging -from ckan.common import config -import urllib.request, urllib.error, urllib.parse - -from binascii import b2a_hex -import ckan.plugins.toolkit as toolkit - - -log = logging.getLogger(__name__) - - -def create_thumbnail(context, resource): - ''' - main entry point into this module - this function is called from pdf_to_thumbnail.plugin - ''' - old_filename = _get_filename_from_context(context) - url_type = resource.get('url_type') - if url_type == 'upload': - is_PDF, filename = _create_thumbnail_from_memory(resource, old_filename) - else: - is_PDF, filename = (False, None) - return is_PDF, filename - - -def _get_filename_from_context(context): - package = context.get('package') - package_id = package.id - package= toolkit.get_action('package_show')(None, {'id': package_id}) - thumbnail = package.get('thumbnail') - return thumbnail - - -def _create_thumbnail_from_memory(resource, old_filename): - filepath = get_resource_path(resource) - is_PDF = _is_pdf(filepath) - if is_PDF: - with open(filepath, 'rb') as file: - new_filename = _create_thumbnail_from_file(file) - if old_filename: - ThumbnailPath.from_filename(old_filename).remove() - return is_PDF, new_filename - else: - return is_PDF, None - - -def get_resource_path(resource): - # see https://stackoverflow.com/questions/46572402/where-does-ckan-store-the-files-pushed-to-datastore-filestore - resource_id = resource.get('id') - filepath = os.path.join( - config.get('ckan.storage_path'), - 'resources', - resource_id[0:3], - resource_id[3:6], - resource_id[6:] - ) - return filepath - - -def _is_pdf(filepath): - file_type = magic.from_file(filepath, mime = True) - return file_type == 'application/pdf' - - -def _create_thumbnail_from_file(file): - width = config.get('ckan.thumbnail.size.width', 410) - new_thumbnail = ThumbnailPath.from_unique_random_name() - file.seek(0) - file_read = file.read() - convert_from_bytes( - file_read, - size=(width, None), - output_folder=new_thumbnail.folder, - output_file=new_thumbnail.filename, - single_file=True, - first_page=0, - last_page=0, - fmt='jpg' - ) - return new_thumbnail.filename_with_extension - - -def thumbnail_folder(): - return os.path.join( - config.get('ckan.storage_path'), - 'thumbnail', - ) - - -def rename_thumbnail_to_random_name(old_filename): - ''' - used by pdf_to_thumbnail.action - ''' - old_filepath = ThumbnailPath.from_filename_with_extension(old_filename) - new_filepath = ThumbnailPath.from_unique_random_name() - try: - os.renames(old_filepath.full_filename, new_filepath.full_filename) - return new_filepath.filename_with_extension - except OSError: - log.warning('The file path "{}" of package was not found.'.format(old_filepath)) - - -def remove_thumbnail(context): - ''' - used by pdf_to_thumbnail.action - ''' - old_filename = _get_filename_from_context(context) - if old_filename: - ThumbnailPath.from_filename_with_extension(old_filename).remove() - - -def resources_of_containing_package(resource): - #todo: change arg order - ''' - used by pdf_to_thumbnail.plugin - ''' - package_id = resource.get('package_id') - package = toolkit.get_action('package_show')(None, {'id': package_id}) - resources = package.get('resources') - return resources - - -def create_thumbnail_if_none_in_package(context, resources): - ''' - used by pdf_to_thumbnail.plugin - loops through a package's resources in the order they have been uploaded - and for each tries to create a thumbnail until it succeeds. - If the package already has a thumbnail the creation step is skipped - ''' - package_dict = _get_package_dict_from_context(context) - if not _has_thumbnail(package_dict): - any(_try_create_thumbnail(context, r) for r in resources) - - -def _get_package_dict_from_context(context): - package_id = context.get('package').id - package_dict = toolkit.get_action('package_show')(None, {'id': package_id}) - return package_dict - - -def _has_thumbnail(package_dict): - thumbnail = package_dict.get('thumbnail') - return bool(thumbnail) - - -def _try_create_thumbnail(context, resource): - is_PDF, filename = create_thumbnail(context, resource) - success = is_PDF - if success: - _write_thumbnail_into_package(context, filename) - return success - - -def _write_thumbnail_into_package(context, filename): - package_dict = _get_package_dict_from_context(context) - if filename: - package_dict.update({'thumbnail': filename}) - toolkit.get_action('package_update')(None, package_dict) - - -class ThumbnailPath(object): - ''' - utility class to manage the path of thumbnail pictures - ''' - - def __init__(self, folder, filename, extension): - self.folder = folder - self.filename = filename - self.extension = extension - - _EXTENSION = '.jpg' - - @staticmethod - def from_filename(filename): - ''' - filename without extension (i.e. '.jpg') - ''' - return ThumbnailPath(thumbnail_folder(), filename, ThumbnailPath._EXTENSION) - - @staticmethod - def from_filename_with_extension(filename_with_extension): - ''' - limited to one dot in filename - ''' - tokens = filename_with_extension.split('.') - if len(tokens) == 1: - filename = filename_with_extension - extension = '' - else: - filename = '.'.join(tokens[:-1]) - extension = '.'.join(['', tokens[-1]]) - return ThumbnailPath(thumbnail_folder(), filename, extension) - - @staticmethod - def from_unique_random_name(): - thumbnail_path = ThumbnailPath._from_random_name() - if thumbnail_path.exists(): - return ThumbnailPath.from_unique_random_name() - return thumbnail_path - - @staticmethod - def _from_random_name(): - number = b2a_hex(os.urandom(15)) - filename = 'thumbnail_picture_' + str(number) - return ThumbnailPath.from_filename(filename) - - @property - def filename_with_extension(self): - return self.filename + self.extension - - @property - def full_filename(self): - return os.path.join(self.folder, self.filename_with_extension) - - def exists(self): - return os.path.exists(self.full_filename) - - def remove(self): - if os.path.exists(self.full_filename): - os.remove(self.full_filename) diff --git a/ckanext/odsh/plugin.py b/ckanext/odsh/plugin.py index 47f2a9a60a395b39fc207d0b7ce607f460784d3e..9217d80042937e443ed645eb904b8c480e0f5270 100644 --- a/ckanext/odsh/plugin.py +++ b/ckanext/odsh/plugin.py @@ -168,10 +168,6 @@ class OdshPlugin(p.SingletonPlugin, DefaultTranslation, tk.DefaultDatasetForm): tk.get_validator('ignore_missing'), tk.get_converter('convert_to_extras') ], - 'thumbnail': [ - tk.get_validator('ignore_missing'), - tk.get_converter('convert_to_extras') - ], 'relatedPackage': [ tk.get_validator('validate_relatedPackage'), tk.get_converter('convert_to_extras') @@ -197,10 +193,6 @@ class OdshPlugin(p.SingletonPlugin, DefaultTranslation, tk.DefaultDatasetForm): tk.get_converter('convert_from_extras'), tk.get_validator('ignore_missing') ], - 'thumbnail': [ - tk.get_converter('convert_from_extras'), - tk.get_validator('ignore_missing') - ], 'relatedPackage': [ tk.get_converter('convert_from_extras'), tk.get_validator('ignore_missing') @@ -279,13 +271,17 @@ class OdshPlugin(p.SingletonPlugin, DefaultTranslation, tk.DefaultDatasetForm): self._update_is_new_in_pkg_dict(pkg_dict) return pkg_dict - def after_dataset_create(self, context, resource): - if resource.get('package_id'): - tools.add_attributes_resources(context, resource) + def after_resource_create(self, context, resource): + log.debug('after_resource_create') + + if resource.get('url_type') == 'upload': + tools.add_resource_attributes(context, resource) + + def after_resource_update(self, context, resource): + log.debug('after_resource_update') - def after_dataset_update(self, context, resource): - if resource.get('package_id'): - tools.add_attributes_resources(context, resource) + if resource.get('url_type') == 'upload': + tools.add_resource_attributes(context, resource) @staticmethod def _update_is_new_in_pkg_dict(pkg_dict): diff --git a/ckanext/odsh/templates/snippets/package_item.html b/ckanext/odsh/templates/snippets/package_item.html index f708758ef23fd045487a1fc65d8f8f8ff13fccf7..4674f7f4f40429472c9aa0f12387ab46e600638e 100644 --- a/ckanext/odsh/templates/snippets/package_item.html +++ b/ckanext/odsh/templates/snippets/package_item.html @@ -29,7 +29,6 @@ Example: {% set daterange = h.get_daterange_prettified(package) %} {% set language_of_package = h.get_language_of_package(package) %} {% set language_icon = h.get_language_icon(package) %} -{% set thumbnail = package.get('thumbnail') %} {% block package_item %} <div class="odsh-dataset-item"> diff --git a/ckanext/odsh/tools.py b/ckanext/odsh/tools.py index 63a45750be357ee61373e13d3f05f17ad1d811d3..dd088a5f4b4017be33c770d73941f84fab407314 100644 --- a/ckanext/odsh/tools.py +++ b/ckanext/odsh/tools.py @@ -1,45 +1,62 @@ import os -from ckanext.odsh.pdf_to_thumbnail.thumbnail import get_resource_path +import logging from ckanext.odsh.lib.uploader import calculate_hash +from ckan.common import config import ckan.plugins.toolkit as toolkit -#import magic -#import pdftotext - -def add_attributes_resources(context, resource): - package_id = resource.get('package_id') - package = toolkit.get_action('package_show')(context, {'id': package_id}) - resources = package.get('resources') - i = 0 - for item in resources: - if item.get('id') == resource.get('id'): - path = get_resource_path(resource) - if os.path.exists(path): - with open(path, 'rb') as file: - - #size - if not item.get('size'): - resource_size = os.path.getsize(path) - item.update({'size': resource_size}) - - #hash - file.seek(0) - hash = calculate_hash(file) - item.update({'hash':hash}) - - #hash algorithm - item.update({'hash_algorithm': 'http://dcat-ap.de/def/hashAlgorithms/md/5'}) - - - #number of pages -# file_type = magic.from_file(path, mime = True) -# if file_type == 'application/pdf': -# file.seek(0) -# pdf = pdftotext.PDF(file) -# number_of_pages = len(pdf) -# item.update({'number_of_pages':number_of_pages}) - - resources[i] = item - break - i = i + 1 - package.update({'resources':resources}) - toolkit.get_action('package_update')(context, package) + +log = logging.getLogger(__name__) + +def add_resource_attributes(context, resource): + log.debug("add_resource_attributes") + + # Check if the resource is already processed for this cycle + if context.get('resource_processed', False): + log.debug("Resource already processed for this cycle, skipping.") + return + + # Return if the resource has no package_id + if not resource.get('package_id', False): + return + + # Mark the resource as processed for this cycle (in context, not on the resource) + context['resource_processed'] = True + + path = _get_resource_path(resource) + + # Check if the path exists and is a file + if os.path.isfile(path): + try: + with open(path, 'rb') as file: + # Calculate and update file size if not already present + if not resource.get('size'): + resource_size = os.path.getsize(path) + resource.update({'size': resource_size}) + + # Calculate and update file hash + file.seek(0) # Ensure we're at the beginning of the file + hash = calculate_hash(file) + resource.update({'hash': hash}) + + # Specify hash algorithm + resource.update({'hash_algorithm': 'http://dcat-ap.de/def/hashAlgorithms/md/5'}) + + # Update the resource in the system + toolkit.get_action('resource_update')(context, resource) + + except Exception as e: + # Handle exceptions that might occur during file reading or hash calculation + toolkit.abort(500, f"Error processing resource at {path}: {str(e)}") + else: + toolkit.abort(404, f"File not found: {path}") + +def _get_resource_path(resource): + # see https://stackoverflow.com/questions/46572402/where-does-ckan-store-the-files-pushed-to-datastore-filestore + resource_id = resource.get('id') + filepath = os.path.join( + config.get('ckan.storage_path'), + 'resources', + resource_id[0:3], + resource_id[3:6], + resource_id[6:] + ) + return filepath \ No newline at end of file diff --git a/setup.py b/setup.py index 545bc06b5ea6f496f1c42502fb37c9e87afb5ee8..00ee552eb63fca4a81569c41d8e3eed1dc60fa82 100755 --- a/setup.py +++ b/setup.py @@ -86,7 +86,6 @@ setup( odsh_autocomplete=ckanext.odsh.plugin_odsh_autocomplete:OdshAutocompletePlugin odsh_dcat_harvest=ckanext.odsh.plugin_odsh_dcat_harvest:OdshDCATHarvestPlugin odsh_collections=ckanext.odsh.collection.plugin:CollectionsPlugin - thumbnail=ckanext.odsh.pdf_to_thumbnail.plugin:ThumbnailPlugin [paste.paster_command] odsh_initialization = ckanext.odsh.commands.initialization:Initialization