fix: recalculate file attributes (size, hash) on resource update

2e7c7f0d · Thorge Petersen · 7a00c063 · 2e7c7f0d · 2e7c7f0d · 2e7c7f0d
Verified Commit 2e7c7f0d authored 2 months ago by Thorge Petersen
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+### Fixed
+- Corrected the handling of file hash and size so that both are properly calculated during resource updates. Added error handling for missing files and for processing failures while updating file attributes.
 ## [2.4.7] - 2025-01-20
 ### Added

--- a/ckanext/odsh/plugin.py
+++ b/ckanext/odsh/plugin.py
@@ -279,13 +279,17 @@ class OdshPlugin(p.SingletonPlugin, DefaultTranslation, tk.DefaultDatasetForm):
        self._update_is_new_in_pkg_dict(pkg_dict)
        return pkg_dict
-    def after_dataset_create(self, context, resource):
+    def after_resource_create(self, context, resource):
-        if resource.get('package_id'):
+        log.debug('after_resource_create')
-            tools.add_attributes_resources(context, resource)
-    def after_dataset_update(self, context, resource):
+        if resource.get('url_type') == 'upload':
-        if resource.get('package_id'):
+            tools.add_resource_attributes(context, resource)
-            tools.add_attributes_resources(context, resource)
+    def after_resource_update(self, context, resource):
+        log.debug('after_resource_update')
+        if resource.get('url_type') == 'upload':
+            tools.add_resource_attributes(context, resource)
    @staticmethod
    def _update_is_new_in_pkg_dict(pkg_dict):

--- a/ckanext/odsh/tools.py
+++ b/ckanext/odsh/tools.py
 import os
+import logging
 from ckanext.odsh.pdf_to_thumbnail.thumbnail import get_resource_path
 from ckanext.odsh.lib.uploader import calculate_hash
 import ckan.plugins.toolkit as toolkit
-#import magic
-#import pdftotext
+log = logging.getLogger(__name__)
-def add_attributes_resources(context, resource):
+def add_resource_attributes(context, resource):
-    package_id = resource.get('package_id')
+    log.debug("add_resource_attributes")
-    package = toolkit.get_action('package_show')(context, {'id': package_id})
-    resources = package.get('resources')
+    # Check if the resource is already processed for this cycle
-    i = 0
+    if context.get('resource_processed', False):
-    for item in resources:    
+        log.debug("Resource already processed for this cycle, skipping.")
-        if item.get('id') == resource.get('id'):
+        return
-            path = get_resource_path(resource)
-            if os.path.exists(path):
+    # Return if the resource has no package_id
-                with open(path, 'rb') as file:                  
+    if not resource.get('package_id', False):
+        return
-                    #size
-                    if not item.get('size'):
+    # Mark the resource as processed for this cycle (in context, not on the resource)
-                        resource_size = os.path.getsize(path)
+    context['resource_processed'] = True
-                        item.update({'size': resource_size})
+    path = get_resource_path(resource)
-                    #hash
-                    file.seek(0)
+    # Check if the path exists and is a file
-                    hash = calculate_hash(file)
+    if os.path.isfile(path):
-                    item.update({'hash':hash})
+        try:
+            with open(path, 'rb') as file:
-                    #hash algorithm
+                # Calculate and update file size if not already present
-                    item.update({'hash_algorithm': 'http://dcat-ap.de/def/hashAlgorithms/md/5'})
+                if not resource.get('size'):
+                    resource_size = os.path.getsize(path)
+                    resource.update({'size': resource_size})
-                    #number of pages
-#                    file_type = magic.from_file(path, mime = True)                    
+                # Calculate and update file hash
-#                    if file_type == 'application/pdf':
+                file.seek(0)  # Ensure we're at the beginning of the file
-#                        file.seek(0)            
+                hash = calculate_hash(file)
-#                        pdf = pdftotext.PDF(file)
+                resource.update({'hash': hash})
-#                        number_of_pages = len(pdf)
-#                        item.update({'number_of_pages':number_of_pages})
+                # Specify hash algorithm
+                resource.update({'hash_algorithm': 'http://dcat-ap.de/def/hashAlgorithms/md/5'})
-                    resources[i] = item 
-            break                         
+                # Update the resource in the system
-        i = i + 1  
+                toolkit.get_action('resource_update')(context, resource)
-    package.update({'resources':resources})
-    toolkit.get_action('package_update')(context, package)
+        except Exception as e:
+            # Handle exceptions that might occur during file reading or hash calculation
+            toolkit.abort(500, f"Error processing resource at {path}: {str(e)}")
+    else:
+        toolkit.abort(404, f"File not found: {path}")