From 0bd7130aa664b15a6b604e9bd6e7a5916db5209d Mon Sep 17 00:00:00 2001
From: Jesper Zedlitz <jesper@zedlitz.de>
Date: Fri, 14 Aug 2020 13:58:40 +0200
Subject: [PATCH] pdf_to_thumbnail aus Transparenzportal

---
 ckanext/odsh/pdf_to_thumbnail/action.py    |   5 +-
 ckanext/odsh/pdf_to_thumbnail/plugin.py    |  12 +-
 ckanext/odsh/pdf_to_thumbnail/thumbnail.py | 321 ++++++++++++---------
 3 files changed, 197 insertions(+), 141 deletions(-)

diff --git a/ckanext/odsh/pdf_to_thumbnail/action.py b/ckanext/odsh/pdf_to_thumbnail/action.py
index bd5d7130..d77845be 100644
--- a/ckanext/odsh/pdf_to_thumbnail/action.py
+++ b/ckanext/odsh/pdf_to_thumbnail/action.py
@@ -4,8 +4,7 @@ import ckan.lib.helpers as helpers
 from ckan.logic.action.update import package_update
 from ckan.logic.action.delete import package_delete
 
-#from thumbnail
-import thumbnail as thumbnail
+import thumbnail
 
 
 def before_package_delete(context, package_id_dict):
@@ -23,7 +22,7 @@ def before_package_update(context, pkg_dict):
         old_filename = package.get('thumbnail')
         if old_filename:
             if str(old_private) != str(new_private):
-                new_filename = thumbnail.change_filepath(old_filename)
+                new_filename = thumbnail.rename_thumbnail_to_random_name(old_filename)
                 pkg_dict['extras'].append({'key': 'thumbnail', 'value': new_filename})
             elif not pkg_dict.get('thumbnail'): 
                 pkg_dict['extras'].append({'key': 'thumbnail', 'value': old_filename})
diff --git a/ckanext/odsh/pdf_to_thumbnail/plugin.py b/ckanext/odsh/pdf_to_thumbnail/plugin.py
index 71523599..ecd4fdef 100644
--- a/ckanext/odsh/pdf_to_thumbnail/plugin.py
+++ b/ckanext/odsh/pdf_to_thumbnail/plugin.py
@@ -1,6 +1,5 @@
 import os 
 
-
 #from ckan
 import ckan.plugins as plugins
 
@@ -21,15 +20,16 @@ class ThumbnailPlugin(plugins.SingletonPlugin):
 
 
 #IResourceController
-    def after_create(self, context, resource):        
-        _, filename = thumbnail.create_thumbnail(context, resource)
-        thumbnail.write_thumbnail_into_package(context, resource, filename)
+    def after_create(self, context, resource):
+        resources = thumbnail.resources_of_containing_package(resource)
+        thumbnail.create_thumbnail_if_none_in_package(context, resources)
         
     def after_update(self, context, resource):
-        thumbnail.check_and_create_thumbnail_after_update(context, resource)
+        resources = thumbnail.resources_of_containing_package(resource)
+        thumbnail.create_thumbnail_if_none_in_package(context, resources)
                 
     def after_delete(self, context, resources):
-        thumbnail.create_thumbnail_for_last_resource(context, resources)
+        thumbnail.create_thumbnail_if_none_in_package(context, resources)
             
 #IConfigurer 
 
diff --git a/ckanext/odsh/pdf_to_thumbnail/thumbnail.py b/ckanext/odsh/pdf_to_thumbnail/thumbnail.py
index 768cfdbf..7a39dea0 100644
--- a/ckanext/odsh/pdf_to_thumbnail/thumbnail.py
+++ b/ckanext/odsh/pdf_to_thumbnail/thumbnail.py
@@ -7,164 +7,221 @@ from ckan.common import config
 import urllib2
 import requests
 
-import binascii
+from binascii import b2a_hex
 import ckan.plugins.toolkit as toolkit
 import ckan.logic as logic
 #from extension
 #from ckanext.odsh.lib.uploader import raise_validation_error_if_virus_found
 
 log = logging.getLogger(__name__)
- 
 
-def get_filename_from_context(context):
+
+def create_thumbnail(context, resource):
+    '''
+    Create a thumbnail for an uploaded resource.
+    Returns (is_PDF, filename); resources that are not uploads are
+    never converted and yield (False, None).
+    '''
+    # looked up first, exactly as before, even if it ends up unused
+    previous_thumbnail = _get_filename_from_context(context)
+    if resource.get('url_type') != 'upload':
+        return False, None
+    # uploads are read from the path given by get_resource_path
+    return _create_thumbnail_from_memory(resource, previous_thumbnail)
+
+
+def _get_filename_from_context(context):
     package = context.get('package')
     package_id = package.id
-    package= toolkit.get_action('package_show')(context, {'id': package_id})
+    package= toolkit.get_action('package_show')(None, {'id': package_id})
     thumbnail = package.get('thumbnail') 
     return  thumbnail
 
-def get_filepath_for_thumbnail(filename):
-    if filename:
-        return config.get('ckan.storage_path') + "/thumbnail/" + filename
-    return config.get('ckan.storage_path') + "/thumbnail/"
 
-def concatenate_filename(filename):
-    return filename + ".jpg"
+def _create_thumbnail_from_memory(resource, old_filename):
+    '''Render a thumbnail for an uploaded PDF; returns (is_PDF, filename or None).'''
+    filepath = get_resource_path(resource)
+    if not _is_pdf(filepath):
+        return False, None
+    with open(filepath, 'rb') as file:
+        new_filename = _create_thumbnail_from_file(file)
+    if old_filename:
+        # the package stores the name including '.jpg', so parse it as such
+        ThumbnailPath.from_filename_with_extension(old_filename).remove()
+    return True, new_filename
+
 
-def get_filepath_to_resource(resource):
+def get_resource_path(resource):
+    # see https://stackoverflow.com/questions/46572402/where-does-ckan-store-the-files-pushed-to-datastore-filestore
     resource_id = resource.get('id')
-    directory = config.get('ckan.storage_path') + '/resources/'
-    #looked up how resources are saved, by locating the keyword resources in the OS 
-    path = directory + resource_id[0:3] + '/' + resource_id[3:6] + '/' +  resource_id[6:]
-    return path
-
-def random_filename():
-    number = binascii.b2a_hex(os.urandom(15))
-    filename = 'thumbnail_picture_' + str(number)    
-    full_filename = concatenate_filename(filename)
-    filepath = get_filepath_for_thumbnail(full_filename)
-    if os.path.exists(filepath):
-        filename = random_filename()
-    return filename
-
-def change_filepath(old_filename):    
-    old_filepath = get_filepath_for_thumbnail(old_filename)
-    new_filename = concatenate_filename(random_filename())
-    new_filepath = get_filepath_for_thumbnail(new_filename)
-    try:
-        os.renames(old_filepath, new_filepath)
-        return new_filename
-    except OSError:
-        log.warning('The file path "{}"  of package was not found.'.format(old_filepath))
-     
+    filepath = os.path.join(
+        config.get('ckan.storage_path'),
+        'resources',
+        resource_id[0:3],
+        resource_id[3:6],
+        resource_id[6:]
+    )
+    return filepath
+
+
+def _is_pdf(filepath):
+    '''True if magic.from_file reports the MIME type 'application/pdf'.'''
+    return magic.from_file(filepath, mime=True) == 'application/pdf'
+
 
-def create_thumbnail_from_file(file, old_filename):
+def _create_thumbnail_from_file(file):
     width = config.get('ckan.thumbnail.size.width', 410)
-    filename = random_filename()
+    new_thumbnail = ThumbnailPath.from_unique_random_name()
     file.seek(0)
     file_read = file.read()
-    directory = get_filepath_for_thumbnail('')
-    if old_filename:
-        old_filepath = get_filepath_for_thumbnail(concatenate_filename(old_filename))
-        if os.path.exists(old_filepath):
-            os.remove(old_filepath)
-    convert_from_bytes(file_read,
-                       size=(width, None),
-                       output_folder=directory,
-                       output_file=filename,
-                       single_file=True,
-                       first_page=0,
-                       last_page=0,
-                       fmt='jpg'
-                       )
-    return concatenate_filename(filename)
-
-
-def create_thumbnail_from_url(resource, old_filename):
-    resource_url = resource.get('url')
-    request = urllib2.Request(resource_url)
-    response = urllib2.urlopen(request, timeout = 100000) 
-    
-    
-    if response.code == 200:
-        filetowrite = response.read()
-        # function is set to private in ckanext.odsh.lib.uploader
-        # raise_validation_error_if_virus_found(filetowrite, response.read())
-        file_type = magic.from_buffer(response.read(), mime = True)
-        header = response.headers
-        resource_size = header.get('Content-Length')
-        
-            
-        max_available_memory = config.get('ckan.max_available_memory', 250000000)  #In Bytes ca. 250 MB
-        with tempfile.SpooledTemporaryFile(max_size=max_available_memory) as file:
-            file.write(filetowrite)
-            
-            new_filename = create_thumbnail_from_file(file, old_filename)        
-            return True, new_filename
-
-def create_thumbnail_from_memory(resource, old_filename):
-    path = get_filepath_to_resource(resource)
-    file_type = magic.from_file(path, mime = True)
-    if file_type == 'application/pdf':
-        with open(path, 'rb') as file:
-            new_filename = create_thumbnail_from_file(file, old_filename)
-        is_PDF = True
-        return is_PDF, new_filename
-    else:
-        is_PDF = False
-        return is_PDF,  None
+    convert_from_bytes(
+        file_read,
+        size=(width, None),
+        output_folder=new_thumbnail.folder,
+        output_file=new_thumbnail.filename,
+        single_file=True,
+        first_page=0,
+        last_page=0,
+        fmt='jpg'
+    )
+    return new_thumbnail.filename_with_extension
+
+
+def thumbnail_folder():
+    '''
+    folder below ckan.storage_path in which thumbnails are kept
+    '''
+    return os.path.join(config.get('ckan.storage_path'), 'thumbnail')
+
+
+def rename_thumbnail_to_random_name(old_filename):
+    '''
+    used by pdf_to_thumbnail.action; returns the new name, or None on failure
+    '''
+    old_filepath = ThumbnailPath.from_filename_with_extension(old_filename)
+    new_filepath = ThumbnailPath.from_unique_random_name()
+    try:
+        os.renames(old_filepath.full_filename, new_filepath.full_filename)
+        return new_filepath.filename_with_extension
+    except OSError:
+        log.warning('The file path "{}"  of package was not found.'.format(old_filepath.full_filename))
+     
 
 def remove_thumbnail(context):
-    old_filename = get_filename_from_context(context)
+    '''
+    used by pdf_to_thumbnail.action
+    '''
+    old_filename = _get_filename_from_context(context)
     if old_filename:
-        old_filepath = get_filepath_for_thumbnail(old_filename)
-        if os.path.exists(old_filepath):
-            os.remove(old_filepath)
+        ThumbnailPath.from_filename_with_extension(old_filename).remove()
 
-def create_thumbnail(context, resource):
-    log.debug('create_thumbnail')
-    old_filename = get_filename_from_context(context)
-    url_type = resource.get('url_type')
-    if url_type == 'upload':
-        is_PDF,  filename = create_thumbnail_from_memory(resource, old_filename)
-    else:
-        is_PDF,  filename = create_thumbnail_from_url(resource, old_filename)
-    return is_PDF,  filename   
 
-def check_and_create_thumbnail_after_update(context, resource):
-    log.debug('check_and_create_thumbnail_after_update')
+def resources_of_containing_package(resource):
+    #todo: change arg order
+    '''
+    used by pdf_to_thumbnail.plugin
+    '''
     package_id = resource.get('package_id')
-    package = toolkit.get_action('package_show')(context, {'id': package_id})
+    package = toolkit.get_action('package_show')(None, {'id': package_id})
     resources = package.get('resources')
-    if len(resources) > 0:
-        last_resource = resources.pop()
-        last_resource_id = last_resource.get('id')
-        resource_id = resource.get('id')
-    if last_resource_id == resource_id and resource.get('url_type') != 'upload':
-        is_PDF,  filename = create_thumbnail(context, resource)
-        if is_PDF:
-            write_thumbnail_into_package(context, resource, filename)  
+    return resources
         
 
-def create_thumbnail_for_last_resource(context, resources):
-    if len(resources) > 0:
-        last_resource = resources.pop()
-        is_PDF, filename = create_thumbnail(context, last_resource)
-        if not is_PDF:
-            create_thumbnail_for_last_resource(context, resources)
+def create_thumbnail_if_none_in_package(context, resources):
+    '''
+    used by pdf_to_thumbnail.plugin
+    Does nothing if the package already has a thumbnail; otherwise
+    tries the given resources in order until one yields a thumbnail.
+    '''
+    if not _has_thumbnail(_get_package_dict_from_context(context)):
+        for resource in resources:
+            if _try_create_thumbnail(context, resource):
+                break
+
+
+def _get_package_dict_from_context(context):
+    '''fetch the dict of context['package'] via the package_show action'''
+    package_id = context.get('package').id
+    return toolkit.get_action('package_show')(None, {'id': package_id})
+
+
+def _has_thumbnail(package_dict):
+    '''True if the package dict holds a non-empty 'thumbnail' entry.'''
+    return bool(package_dict.get('thumbnail'))
+
+
+def _try_create_thumbnail(context, resource):
+    '''create and store a thumbnail; the return value tells whether it worked'''
+    created, thumbnail_name = create_thumbnail(context, resource)
+    if created:
+        _write_thumbnail_into_package(context, thumbnail_name)
+    return created
+
+
+def _write_thumbnail_into_package(context, filename):
+    package = _get_package_dict_from_context(context)
+    if filename:  # without a name the package is written back unchanged
+        package['thumbnail'] = filename
+    toolkit.get_action('package_update')(None, package)
+    
+
+class ThumbnailPath(object):
+    '''
+    utility class to manage the path of thumbnail pictures
+    '''
+
+    def __init__(self, folder, filename, extension):
+        self.folder = folder
+        self.filename = filename
+        self.extension = extension
+    
+    _EXTENSION = '.jpg'
+    
+    @staticmethod
+    def from_filename(filename):
+        '''
+        filename without extension (i.e. '.jpg')
+        '''
+        return ThumbnailPath(thumbnail_folder(), filename, ThumbnailPath._EXTENSION)
+    
+    @staticmethod
+    def from_filename_with_extension(filename_with_extension):
+        '''
+        the extension is everything from the last dot on (may be empty)
+        '''
+        tokens = filename_with_extension.split('.')
+        if len(tokens) == 1:
+            filename = filename_with_extension
+            extension = ''
         else:
-            write_thumbnail_into_package(context, last_resource, filename)
-    else:
-        remove_thumbnail(context)
-        package = context.get('package')
-        package_id = package.id
-        package= toolkit.get_action('package_show')(context, {'id': package_id})
-        package.update({'thumbnail': None})
-        toolkit.get_action('package_update')(context, package)
-
-def write_thumbnail_into_package(context, resource, filename):
-        package_id = resource.get('package_id')
-        package = toolkit.get_action('package_show')(context, {'id': package_id})
-        if filename:
-            package.update({'thumbnail': filename})
-        toolkit.get_action('package_update')(context, package)
+            filename = '.'.join(tokens[:-1])
+            extension = '.'.join(['', tokens[-1]])
+        return ThumbnailPath(thumbnail_folder(), filename, extension)
+
+    @staticmethod
+    def from_unique_random_name():
+        thumbnail_path = ThumbnailPath._from_random_name()
+        if thumbnail_path.exists():
+            return ThumbnailPath.from_unique_random_name()
+        return thumbnail_path
+    
+    @staticmethod
+    def _from_random_name():
+        number = b2a_hex(os.urandom(15))
+        filename = 'thumbnail_picture_' + str(number)
+        return ThumbnailPath.from_filename(filename)
+    
+    @property
+    def filename_with_extension(self):
+        return self.filename + self.extension
+    
+    @property
+    def full_filename(self):
+        return os.path.join(self.folder, self.filename_with_extension)
+    
+    def exists(self):
+        return os.path.exists(self.full_filename)
+    
+    def remove(self):
+        if os.path.exists(self.full_filename):
+            os.remove(self.full_filename)
-- 
GitLab