diff --git a/dcat_catalog_check.py b/dcat_catalog_check.py
index 91dde8c09e5edccafbe07009c9e48409432f1bf8..071e9ba0bab17c6f6bffc6f23ebdafc611212d1e 100755
--- a/dcat_catalog_check.py
+++ b/dcat_catalog_check.py
@@ -174,8 +174,7 @@ class DcatCatalogCheck:
         format = resource["format"].lower()
         try:
             # dynamically import the corresponding module for the format
-            format_check_module = importlib.import_module(
-                f"formats.{format}_format")
+            format_check_module = importlib.import_module(f"formats.{format}_format")
         except ModuleNotFoundError:
             format_check_module = None
 
@@ -194,6 +193,9 @@ class DcatCatalogCheck:
             if "etag" in response.headers:
                 resource["etag"] = response.headers["etag"]
 
+            if "content-length" in response.headers:
+                resource["size"] = response.headers["content-length"]
+
         except requests.exceptions.RequestException as err:
             # Handle connection, timeout, or other request errors
             resource["accessible"] = False
@@ -210,8 +212,7 @@ class DcatCatalogCheck:
 
         # write the content of the HTTP response into a temporary file
         original_file_name = url.split("/")[-1]
-        suffix = original_file_name.split(
-            ".")[-1] if "." in original_file_name else ""
+        suffix = original_file_name.split(".")[-1] if "." in original_file_name else ""
         with tempfile.NamedTemporaryFile(
             delete=False, suffix="." + suffix
         ) as temp_file:
@@ -234,8 +235,7 @@ class DcatCatalogCheck:
             decompressor = decompressors.get(resource["mimetype"])
 
             if not decompressor:
-                self.logger.warning(
-                    f"Unknown compression {resource['mimetype']}.")
+                self.logger.warning(f"Unknown compression {resource['mimetype']}.")
             else:
                 with tempfile.NamedTemporaryFile(delete=False) as decompressed_file:
                     with decompressor.open(temp_file.name, "rb") as compressed_file:
@@ -246,8 +246,7 @@ class DcatCatalogCheck:
         resource["mimetype"] = self._guess_mime_type(temp_file.name)
 
         if self._is_container(resource["mimetype"], resource["format"]):
-            self._check_container_file(
-                resource, temp_file, format_check_module)
+            self._check_container_file(resource, temp_file, format_check_module)
         else:
             self._check_single_file(resource, temp_file, format_check_module)
 
@@ -275,8 +274,7 @@ class DcatCatalogCheck:
                     temp_file.write(file.read())
                     temp_file.flush()
 
-                    resource["mimetype"] = self._guess_mime_type(
-                        temp_file.name)
+                    resource["mimetype"] = self._guess_mime_type(temp_file.name)
 
                     validation_result = (
                         validation_result and self._check_single_file(
@@ -290,14 +288,12 @@ class DcatCatalogCheck:
 
             return contains_at_least_one_relevant_file and validation_result
         else:
-            self.logger.error(
-                f"Unsupported container format {resource['mimetype']}")
+            self.logger.error(f"Unsupported container format {resource['mimetype']}")
 
     def _check_single_file(self, resource, temp_file, format_check_module):
         if format_check_module:
             # call the function `process` that is defined in every modul
-            resource["valid"] = format_check_module.is_valid(
-                resource, temp_file)
+            resource["valid"] = format_check_module.is_valid(resource, temp_file)
         else:
             # There is no specialized check for the specified format.
             # Does the returned MIME type match the promised format?
@@ -322,8 +318,7 @@ class DcatCatalogCheck:
         ):
             hash_algorithm = hashlib.md5()
         else:
-            print(
-                f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
+            print(f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
             return
 
         with open(temp_file.name, "rb") as f:
@@ -418,8 +413,7 @@ class DcatCatalogCheck:
             publisher = graph.value(dataset, DCTERMS.publisher)
 
             if not publisher:
-                self.logger.warning(
-                    f"Publisher not found for dataset: {dataset}")
+                self.logger.warning(f"Publisher not found for dataset: {dataset}")
                 return None
 
             # Attempt to get the publisher's name
@@ -433,8 +427,7 @@ class DcatCatalogCheck:
 
         except Exception as e:
             # Log any unexpected errors
-            self.logger.error(
-                f"Error retrieving publisher for dataset {dataset}: {e}")
+            self.logger.error(f"Error retrieving publisher for dataset {dataset}: {e}")
             return None
 
     def _process_datasets(self, datasets, g):
@@ -459,8 +452,7 @@ class DcatCatalogCheck:
                 url = str(resource["url"])
                 if self._needs_check(url):
-                    checksum_resource = g.value(
-                        distribution, SPDX.checksum)
+                    checksum_resource = g.value(distribution, SPDX.checksum)
                     if checksum_resource:
                         resource["checksum_algorithm"] = str(
                             g.value(checksum_resource, SPDX.algorithm)
                         )
@@ -481,7 +473,8 @@ class DcatCatalogCheck:
     def read_previous_results(self, file_path):
         if not os.path.exists(file_path):
             self.logger.warning(
-                f"File '{file_path}' does not exist. No previous results loaded.")
+                f"File '{file_path}' does not exist. No previous results loaded."
+            )
             return
 
         loaded_count = 0
@@ -500,7 +493,8 @@ class DcatCatalogCheck:
 
                     url = json_object.get("url")
                     if not url:
                         self.logger.warning(
-                            f"Line {line_number} is missing 'url': {line}")
+                            f"Line {line_number} is missing 'url': {line}"
+                        )
                         skipped_count += 1
                         continue
@@ -508,12 +502,12 @@ class DcatCatalogCheck:
                     self.results[url] = json_object
                     loaded_count += 1
                 except json.JSONDecodeError as e:
-                    self.logger.error(
-                        f"Invalid JSON at line {line_number}: {e}")
+                    self.logger.error(f"Invalid JSON at line {line_number}: {e}")
                     skipped_count += 1
 
         self.logger.info(
-            f"Loaded {loaded_count} results from '{file_path}', skipped {skipped_count} lines.")
+            f"Loaded {loaded_count} results from '{file_path}', skipped {skipped_count} lines."
+        )
 
     def read_dcat_catalog(self, url):
         while url:
@@ -536,8 +530,7 @@ class DcatCatalogCheck:
 
             self._process_datasets(datasets, g)
 
-            paged_collection = g.value(
-                predicate=RDF.type, object=HYDRA.PagedCollection)
+            paged_collection = g.value(predicate=RDF.type, object=HYDRA.PagedCollection)
             next_page = g.value(paged_collection, HYDRA.nextPage)
             url = str(next_page) if next_page else None
 
@@ -562,12 +555,9 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--url", help="DCAT catalog URL")
    parser.add_argument("--log_file", help="Log file path")
-    parser.add_argument(
-        "--results", help="File from which the results are loaded")
-    parser.add_argument("--verbose", action="store_true",
-                        help="Enable verbose logging")
-    parser.add_argument("--debug", action="store_true",
-                        help="Enable debug logging")
+    parser.add_argument("--results", help="File from which the results are loaded")
+    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
     parser.add_argument(
         "--recheck",
         action="store_true",
@@ -578,8 +568,7 @@ if __name__ == "__main__":
         action="store_true",
         help="Just check new entries from the catalog. Do not re-check existing results.",
     )
-    parser.add_argument(
-        "--check-format", help="Only check the specified format")
+    parser.add_argument("--check-format", help="Only check the specified format")
    parser.add_argument(
         "--force-check-format",
         help="Check distributinons with the specified format regardless of previous results",