From 0bd7130aa664b15a6b604e9bd6e7a5916db5209d Mon Sep 17 00:00:00 2001 From: Jesper Zedlitz <jesper@zedlitz.de> Date: Fri, 14 Aug 2020 13:58:40 +0200 Subject: [PATCH] pdf_to_thumbnail aus Transparenzportal --- ckanext/odsh/pdf_to_thumbnail/action.py | 5 +- ckanext/odsh/pdf_to_thumbnail/plugin.py | 12 +- ckanext/odsh/pdf_to_thumbnail/thumbnail.py | 321 ++++++++++++--------- 3 files changed, 197 insertions(+), 141 deletions(-) diff --git a/ckanext/odsh/pdf_to_thumbnail/action.py b/ckanext/odsh/pdf_to_thumbnail/action.py index bd5d7130..d77845be 100644 --- a/ckanext/odsh/pdf_to_thumbnail/action.py +++ b/ckanext/odsh/pdf_to_thumbnail/action.py @@ -4,8 +4,7 @@ import ckan.lib.helpers as helpers from ckan.logic.action.update import package_update from ckan.logic.action.delete import package_delete -#from thumbnail -import thumbnail as thumbnail +import thumbnail def before_package_delete(context, package_id_dict): @@ -23,7 +22,7 @@ def before_package_update(context, pkg_dict): old_filename = package.get('thumbnail') if old_filename: if str(old_private) != str(new_private): - new_filename = thumbnail.change_filepath(old_filename) + new_filename = thumbnail.rename_thumbnail_to_random_name(old_filename) pkg_dict['extras'].append({'key': 'thumbnail', 'value': new_filename}) elif not pkg_dict.get('thumbnail'): pkg_dict['extras'].append({'key': 'thumbnail', 'value': old_filename}) diff --git a/ckanext/odsh/pdf_to_thumbnail/plugin.py b/ckanext/odsh/pdf_to_thumbnail/plugin.py index 71523599..ecd4fdef 100644 --- a/ckanext/odsh/pdf_to_thumbnail/plugin.py +++ b/ckanext/odsh/pdf_to_thumbnail/plugin.py @@ -1,6 +1,5 @@ import os - #from ckan import ckan.plugins as plugins @@ -21,15 +20,16 @@ class ThumbnailPlugin(plugins.SingletonPlugin): #IResourceController - def after_create(self, context, resource): - _, filename = thumbnail.create_thumbnail(context, resource) - thumbnail.write_thumbnail_into_package(context, resource, filename) + def after_create(self, context, resource): + resources = thumbnail.resources_of_containing_package(resource) + thumbnail.create_thumbnail_if_none_in_package(context, resources) def after_update(self, context, resource): - thumbnail.check_and_create_thumbnail_after_update(context, resource) + resources = thumbnail.resources_of_containing_package(resource) + thumbnail.create_thumbnail_if_none_in_package(context, resources) def after_delete(self, context, resources): - thumbnail.create_thumbnail_for_last_resource(context, resources) + thumbnail.create_thumbnail_if_none_in_package(context, resources) #IConfigurer diff --git a/ckanext/odsh/pdf_to_thumbnail/thumbnail.py b/ckanext/odsh/pdf_to_thumbnail/thumbnail.py index 768cfdbf..7a39dea0 100644 --- a/ckanext/odsh/pdf_to_thumbnail/thumbnail.py +++ b/ckanext/odsh/pdf_to_thumbnail/thumbnail.py @@ -7,164 +7,221 @@ from ckan.common import config import urllib2 import requests -import binascii +from binascii import b2a_hex import ckan.plugins.toolkit as toolkit import ckan.logic as logic #from extension #from ckanext.odsh.lib.uploader import raise_validation_error_if_virus_found log = logging.getLogger(__name__) - -def get_filename_from_context(context): + +def create_thumbnail(context, resource): + ''' + main entry point into this module + this function is called from pdf_to_thumbnail.plugin + ''' + old_filename = _get_filename_from_context(context) + url_type = resource.get('url_type') + if url_type == 'upload': + is_PDF, filename = _create_thumbnail_from_memory(resource, old_filename) + else: + is_PDF, filename = (False, None) + return is_PDF, filename + + +def _get_filename_from_context(context): package = context.get('package') package_id = package.id - package= toolkit.get_action('package_show')(context, {'id': package_id}) + package= toolkit.get_action('package_show')(None, {'id': package_id}) thumbnail = package.get('thumbnail') return thumbnail -def get_filepath_for_thumbnail(filename): - if filename: - return config.get('ckan.storage_path') + "/thumbnail/" + filename - return config.get('ckan.storage_path') + "/thumbnail/" -def concatenate_filename(filename): - return filename + ".jpg" +def _create_thumbnail_from_memory(resource, old_filename): + filepath = get_resource_path(resource) + is_PDF = _is_pdf(filepath) + if is_PDF: + with open(filepath, 'rb') as file: + new_filename = _create_thumbnail_from_file(file) + if old_filename: + ThumbnailPath.from_filename(old_filename).remove() + return is_PDF, new_filename + else: + return is_PDF, None + -def get_filepath_to_resource(resource): +def get_resource_path(resource): + # see https://stackoverflow.com/questions/46572402/where-does-ckan-store-the-files-pushed-to-datastore-filestore resource_id = resource.get('id') - directory = config.get('ckan.storage_path') + '/resources/' - #looked up how resources are saved, by locating the keyword resources in the OS - path = directory + resource_id[0:3] + '/' + resource_id[3:6] + '/' + resource_id[6:] - return path - -def random_filename(): - number = binascii.b2a_hex(os.urandom(15)) - filename = 'thumbnail_picture_' + str(number) - full_filename = concatenate_filename(filename) - filepath = get_filepath_for_thumbnail(full_filename) - if os.path.exists(filepath): - filename = random_filename() - return filename - -def change_filepath(old_filename): - old_filepath = get_filepath_for_thumbnail(old_filename) - new_filename = concatenate_filename(random_filename()) - new_filepath = get_filepath_for_thumbnail(new_filename) - try: - os.renames(old_filepath, new_filepath) - return new_filename - except OSError: - log.warning('The file path "{}" of package was not found.'.format(old_filepath)) - + filepath = os.path.join( + config.get('ckan.storage_path'), + 'resources', + resource_id[0:3], + resource_id[3:6], + resource_id[6:] + ) + return filepath + + +def _is_pdf(filepath): + file_type = magic.from_file(filepath, mime = True) + return file_type == 'application/pdf' + -def create_thumbnail_from_file(file, old_filename): +def _create_thumbnail_from_file(file): width = config.get('ckan.thumbnail.size.width', 410) - filename = random_filename() + new_thumbnail = ThumbnailPath.from_unique_random_name() file.seek(0) file_read = file.read() - directory = get_filepath_for_thumbnail('') - if old_filename: - old_filepath = get_filepath_for_thumbnail(concatenate_filename(old_filename)) - if os.path.exists(old_filepath): - os.remove(old_filepath) - convert_from_bytes(file_read, - size=(width, None), - output_folder=directory, - output_file=filename, - single_file=True, - first_page=0, - last_page=0, - fmt='jpg' - ) - return concatenate_filename(filename) - - -def create_thumbnail_from_url(resource, old_filename): - resource_url = resource.get('url') - request = urllib2.Request(resource_url) - response = urllib2.urlopen(request, timeout = 100000) - - - if response.code == 200: - filetowrite = response.read() - # function is set to private in ckanext.odsh.lib.uploader - # raise_validation_error_if_virus_found(filetowrite, response.read()) - file_type = magic.from_buffer(response.read(), mime = True) - header = response.headers - resource_size = header.get('Content-Length') - - - max_available_memory = config.get('ckan.max_available_memory', 250000000) #In Bytes ca. 250 MB - with tempfile.SpooledTemporaryFile(max_size=max_available_memory) as file: - file.write(filetowrite) - - new_filename = create_thumbnail_from_file(file, old_filename) - return True, new_filename - -def create_thumbnail_from_memory(resource, old_filename): - path = get_filepath_to_resource(resource) - file_type = magic.from_file(path, mime = True) - if file_type == 'application/pdf': - with open(path, 'rb') as file: - new_filename = create_thumbnail_from_file(file, old_filename) - is_PDF = True - return is_PDF, new_filename - else: - is_PDF = False - return is_PDF, None + convert_from_bytes( + file_read, + size=(width, None), + output_folder=new_thumbnail.folder, + output_file=new_thumbnail.filename, + single_file=True, + first_page=0, + last_page=0, + fmt='jpg' + ) + return new_thumbnail.filename_with_extension + + +def thumbnail_folder(): + return os.path.join( + config.get('ckan.storage_path'), + 'thumbnail', + ) + + +def rename_thumbnail_to_random_name(old_filename): + ''' + used by pdf_to_thumbnail.action + ''' + old_filepath = ThumbnailPath.from_filename_with_extension(old_filename) + new_filepath = ThumbnailPath.from_unique_random_name() + try: + os.renames(old_filepath.full_filename, new_filepath.full_filename) + return new_filepath.filename_with_extension + except OSError: + log.warning('The file path "{}" of package was not found.'.format(old_filepath)) + def remove_thumbnail(context): - old_filename = get_filename_from_context(context) + ''' + used by pdf_to_thumbnail.action + ''' + old_filename = _get_filename_from_context(context) if old_filename: - old_filepath = get_filepath_for_thumbnail(old_filename) - if os.path.exists(old_filepath): - os.remove(old_filepath) + ThumbnailPath.from_filename_with_extension(old_filename).remove() -def create_thumbnail(context, resource): - log.debug('create_thumbnail') - old_filename = get_filename_from_context(context) - url_type = resource.get('url_type') - if url_type == 'upload': - is_PDF, filename = create_thumbnail_from_memory(resource, old_filename) - else: - is_PDF, filename = create_thumbnail_from_url(resource, old_filename) - return is_PDF, filename -def check_and_create_thumbnail_after_update(context, resource): - log.debug('check_and_create_thumbnail_after_update') +def resources_of_containing_package(resource): + #todo: change arg order + ''' + used by pdf_to_thumbnail.plugin + ''' package_id = resource.get('package_id') - package = toolkit.get_action('package_show')(context, {'id': package_id}) + package = toolkit.get_action('package_show')(None, {'id': package_id}) resources = package.get('resources') - if len(resources) > 0: - last_resource = resources.pop() - last_resource_id = last_resource.get('id') - resource_id = resource.get('id') - if last_resource_id == resource_id and resource.get('url_type') != 'upload': - is_PDF, filename = create_thumbnail(context, resource) - if is_PDF: - write_thumbnail_into_package(context, resource, filename) + return resources -def create_thumbnail_for_last_resource(context, resources): - if len(resources) > 0: - last_resource = resources.pop() - is_PDF, filename = create_thumbnail(context, last_resource) - if not is_PDF: - create_thumbnail_for_last_resource(context, resources) +def create_thumbnail_if_none_in_package(context, resources): + ''' + used by pdf_to_thumbnail.plugin + loops through a package's resources in the order they have been uploaded + and for each tries to create a thumbnail until it succeeds. + If the package already has a thumbnail the creation step is skipped + ''' + package_dict = _get_package_dict_from_context(context) + if not _has_thumbnail(package_dict): + any(_try_create_thumbnail(context, r) for r in resources) + + +def _get_package_dict_from_context(context): + package_id = context.get('package').id + package_dict = toolkit.get_action('package_show')(None, {'id': package_id}) + return package_dict + + +def _has_thumbnail(package_dict): + thumbnail = package_dict.get('thumbnail') + return bool(thumbnail) + + +def _try_create_thumbnail(context, resource): + is_PDF, filename = create_thumbnail(context, resource) + success = is_PDF + if success: + _write_thumbnail_into_package(context, filename) + return success + + +def _write_thumbnail_into_package(context, filename): + package_dict = _get_package_dict_from_context(context) + if filename: + package_dict.update({'thumbnail': filename}) + toolkit.get_action('package_update')(None, package_dict) + + +class ThumbnailPath(object): + ''' + utility class to manage the path of thumbnail pictures + ''' + + def __init__(self, folder, filename, extension): + self.folder = folder + self.filename = filename + self.extension = extension + + _EXTENSION = '.jpg' + + @staticmethod + def from_filename(filename): + ''' + filename without extension (i.e. '.jpg') + ''' + return ThumbnailPath(thumbnail_folder(), filename, ThumbnailPath._EXTENSION) + + @staticmethod + def from_filename_with_extension(filename_with_extension): + ''' + limited to one dot in filename + ''' + tokens = filename_with_extension.split('.') + if len(tokens) == 1: + filename = filename_with_extension + extension = '' else: - write_thumbnail_into_package(context, last_resource, filename) - else: - remove_thumbnail(context) - package = context.get('package') - package_id = package.id - package= toolkit.get_action('package_show')(context, {'id': package_id}) - package.update({'thumbnail': None}) - toolkit.get_action('package_update')(context, package) - -def write_thumbnail_into_package(context, resource, filename): - package_id = resource.get('package_id') - package = toolkit.get_action('package_show')(context, {'id': package_id}) - if filename: - package.update({'thumbnail': filename}) - toolkit.get_action('package_update')(context, package) + filename = '.'.join(tokens[:-1]) + extension = '.'.join(['', tokens[-1]]) + return ThumbnailPath(thumbnail_folder(), filename, extension) + + @staticmethod + def from_unique_random_name(): + thumbnail_path = ThumbnailPath._from_random_name() + if thumbnail_path.exists(): + return ThumbnailPath.from_unique_random_name() + return thumbnail_path + + @staticmethod + def _from_random_name(): + number = b2a_hex(os.urandom(15)) + filename = 'thumbnail_picture_' + str(number) + return ThumbnailPath.from_filename(filename) + + @property + def filename_with_extension(self): + return self.filename + self.extension + + @property + def full_filename(self): + return os.path.join(self.folder, self.filename_with_extension) + + def exists(self): + return os.path.exists(self.full_filename) + + def remove(self): + if os.path.exists(self.full_filename): + os.remove(self.full_filename) -- GitLab