From 2e7c7f0d3643ab8a2f4576d60fd7d04051fcb6a7 Mon Sep 17 00:00:00 2001
From: Thorge Petersen <petersen@rz.uni-kiel.de>
Date: Mon, 27 Jan 2025 12:48:38 +0100
Subject: [PATCH] fix: recalculate file attributes (size, hash) on resource
 update

---
 CHANGELOG.md           |  6 +++
 ckanext/odsh/plugin.py | 16 +++++---
 ckanext/odsh/tools.py  | 87 ++++++++++++++++++++++--------------------
 3 files changed, 62 insertions(+), 47 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9895eb16..03c4b99d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Fixed
+
+- Corrected handling of file hash and size updates so that they are properly calculated during resource updates. Added error handling for missing files and for processing failures during file attribute updates.
+
 ## [2.4.7] - 2025-01-20
 
 ### Added
diff --git a/ckanext/odsh/plugin.py b/ckanext/odsh/plugin.py
index 47f2a9a6..f0e70e3d 100644
--- a/ckanext/odsh/plugin.py
+++ b/ckanext/odsh/plugin.py
@@ -279,13 +279,17 @@ class OdshPlugin(p.SingletonPlugin, DefaultTranslation, tk.DefaultDatasetForm):
         self._update_is_new_in_pkg_dict(pkg_dict)
         return pkg_dict
 
-    def after_dataset_create(self, context, resource):
-        if resource.get('package_id'):
-            tools.add_attributes_resources(context, resource)
+    def after_resource_create(self, context, resource):
+        log.debug('after_resource_create')
 
-    def after_dataset_update(self, context, resource):
-        if resource.get('package_id'):
-            tools.add_attributes_resources(context, resource)
+        if resource.get('url_type') == 'upload':  # size/hash are only derived for uploaded files
+            tools.add_resource_attributes(context, resource)
+
+    def after_resource_update(self, context, resource):
+        log.debug('after_resource_update')
+
+        if resource.get('url_type') == 'upload':  # size/hash are only derived for uploaded files
+            tools.add_resource_attributes(context, resource)
 
     @staticmethod
     def _update_is_new_in_pkg_dict(pkg_dict):
diff --git a/ckanext/odsh/tools.py b/ckanext/odsh/tools.py
index 63a45750..b3cbd441 100644
--- a/ckanext/odsh/tools.py
+++ b/ckanext/odsh/tools.py
@@ -1,45 +1,50 @@
 import os
+import logging
 from ckanext.odsh.pdf_to_thumbnail.thumbnail import get_resource_path
 from ckanext.odsh.lib.uploader import calculate_hash
 import ckan.plugins.toolkit as toolkit
-#import magic
-#import pdftotext
-
-def add_attributes_resources(context, resource):
-    package_id = resource.get('package_id')
-    package = toolkit.get_action('package_show')(context, {'id': package_id})
-    resources = package.get('resources')
-    i = 0
-    for item in resources:    
-        if item.get('id') == resource.get('id'):
-            path = get_resource_path(resource)
-            if os.path.exists(path):
-                with open(path, 'rb') as file:                  
-                    
-                    #size
-                    if not item.get('size'):
-                        resource_size = os.path.getsize(path)
-                        item.update({'size': resource_size})
-                    
-                    #hash
-                    file.seek(0)
-                    hash = calculate_hash(file)
-                    item.update({'hash':hash})
-                    
-                    #hash algorithm
-                    item.update({'hash_algorithm': 'http://dcat-ap.de/def/hashAlgorithms/md/5'})
-                    
-            
-                    #number of pages
-#                    file_type = magic.from_file(path, mime = True)                    
-#                    if file_type == 'application/pdf':
-#                        file.seek(0)            
-#                        pdf = pdftotext.PDF(file)
-#                        number_of_pages = len(pdf)
-#                        item.update({'number_of_pages':number_of_pages})
-
-                    resources[i] = item 
-            break                         
-        i = i + 1  
-    package.update({'resources':resources})
-    toolkit.get_action('package_update')(context, package)
+
+log = logging.getLogger(__name__)
+
+def add_resource_attributes(context, resource):
+    """Store size, hash and hash algorithm of an uploaded resource file.
+
+    Aborts with 404 if the file is missing and with 500 if reading it fails.
+    """
+    log.debug("add_resource_attributes")
+
+    # The resource_update call below re-enters this function through the
+    # plugin hooks; the context flag stops that nested call from recursing.
+    if context.get('resource_processed', False):
+        log.debug("Resource already processed for this cycle, skipping.")
+        return
+
+    # Return if the resource has no package_id
+    if not resource.get('package_id', False):
+        return
+
+    path = get_resource_path(resource)
+    if not os.path.isfile(path):
+        toolkit.abort(404, f"File not found: {path}")
+
+    # Keep only file access inside the try block so that errors raised by
+    # the resource_update action are not masked as a generic 500.
+    try:
+        with open(path, 'rb') as file:
+            file_hash = calculate_hash(file)
+        # Calculate and update file size if not already present
+        if not resource.get('size'):
+            resource.update({'size': os.path.getsize(path)})
+    except Exception as e:
+        toolkit.abort(500, f"Error processing resource at {path}: {str(e)}")
+
+    resource.update({'hash': file_hash})
+    resource.update({'hash_algorithm': 'http://dcat-ap.de/def/hashAlgorithms/md/5'})
+
+    # Set the flag only while the nested update runs and clear it afterwards,
+    # so that the same context can be reused for further resources.
+    context['resource_processed'] = True
+    try:
+        toolkit.get_action('resource_update')(context, resource)
+    finally:
+        context.pop('resource_processed', None)
-- 
GitLab