Verified Commit 152647cb authored by Jesper Zedlitz

store size of the distribution

ruff format
parent b072c066
1 merge request: !1 Update Formats, Dependencies, and Dockerfile Configuration
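
The only functional change in this commit is the new size field: when the HTTP response for a distribution carries a Content-Length header, its value is stored on the resource record; the remaining hunks are ruff formatting only. Below is a minimal sketch of that behaviour, assuming a resource dict and a requests response as used elsewhere in DcatCatalogCheck; the helper name store_distribution_size and the example URL are illustrative, not part of the module.

import requests

def store_distribution_size(resource: dict, response: requests.Response) -> None:
    # Mirrors the added hunk: keep the ETag and the reported Content-Length if present.
    if "etag" in response.headers:
        resource["etag"] = response.headers["etag"]
    if "content-length" in response.headers:
        # Header values are strings; the size is stored as reported by the server.
        resource["size"] = response.headers["content-length"]

if __name__ == "__main__":
    resource = {"url": "https://example.org/data.csv"}  # illustrative URL
    response = requests.get(resource["url"], stream=True)
    store_distribution_size(resource, response)
    print(resource.get("size"))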
@@ -174,8 +174,7 @@ class DcatCatalogCheck:
         format = resource["format"].lower()
         try:
             # dynamically import the corresponding module for the format
-            format_check_module = importlib.import_module(
-                f"formats.{format}_format")
+            format_check_module = importlib.import_module(f"formats.{format}_format")
         except ModuleNotFoundError:
             format_check_module = None
@@ -194,6 +193,9 @@ class DcatCatalogCheck:
             if "etag" in response.headers:
                 resource["etag"] = response.headers["etag"]
+            if "content-length" in response.headers:
+                resource["size"] = response.headers["content-length"]
         except requests.exceptions.RequestException as err:
             # Handle connection, timeout, or other request errors
             resource["accessible"] = False
@@ -210,8 +212,7 @@ class DcatCatalogCheck:
         # write the content of the HTTP response into a temporary file
         original_file_name = url.split("/")[-1]
-        suffix = original_file_name.split(
-            ".")[-1] if "." in original_file_name else ""
+        suffix = original_file_name.split(".")[-1] if "." in original_file_name else ""
         with tempfile.NamedTemporaryFile(
             delete=False, suffix="." + suffix
         ) as temp_file:
@@ -234,8 +235,7 @@ class DcatCatalogCheck:
             decompressor = decompressors.get(resource["mimetype"])
             if not decompressor:
-                self.logger.warning(
-                    f"Unknown compression {resource['mimetype']}.")
+                self.logger.warning(f"Unknown compression {resource['mimetype']}.")
             else:
                 with tempfile.NamedTemporaryFile(delete=False) as decompressed_file:
                     with decompressor.open(temp_file.name, "rb") as compressed_file:
@@ -246,8 +246,7 @@ class DcatCatalogCheck:
         resource["mimetype"] = self._guess_mime_type(temp_file.name)
         if self._is_container(resource["mimetype"], resource["format"]):
-            self._check_container_file(
-                resource, temp_file, format_check_module)
+            self._check_container_file(resource, temp_file, format_check_module)
         else:
             self._check_single_file(resource, temp_file, format_check_module)
@@ -275,8 +274,7 @@ class DcatCatalogCheck:
                     temp_file.write(file.read())
                     temp_file.flush()
-                    resource["mimetype"] = self._guess_mime_type(
-                        temp_file.name)
+                    resource["mimetype"] = self._guess_mime_type(temp_file.name)
                     validation_result = (
                         validation_result
                         and self._check_single_file(
@@ -290,14 +288,12 @@ class DcatCatalogCheck:
             return contains_at_least_one_relevant_file and validation_result
         else:
-            self.logger.error(
-                f"Unsupported container format {resource['mimetype']}")
+            self.logger.error(f"Unsupported container format {resource['mimetype']}")
 
     def _check_single_file(self, resource, temp_file, format_check_module):
         if format_check_module:
             # call the function `process` that is defined in every modul
-            resource["valid"] = format_check_module.is_valid(
-                resource, temp_file)
+            resource["valid"] = format_check_module.is_valid(resource, temp_file)
         else:
             # There is no specialized check for the specified format.
             # Does the returned MIME type match the promised format?
@@ -322,8 +318,7 @@ class DcatCatalogCheck:
         ):
             hash_algorithm = hashlib.md5()
         else:
-            print(
-                f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
+            print(f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
             return
         with open(temp_file.name, "rb") as f:
@@ -418,8 +413,7 @@ class DcatCatalogCheck:
             publisher = graph.value(dataset, DCTERMS.publisher)
             if not publisher:
-                self.logger.warning(
-                    f"Publisher not found for dataset: {dataset}")
+                self.logger.warning(f"Publisher not found for dataset: {dataset}")
                 return None
             # Attempt to get the publisher's name
@@ -433,8 +427,7 @@ class DcatCatalogCheck:
         except Exception as e:
             # Log any unexpected errors
-            self.logger.error(
-                f"Error retrieving publisher for dataset {dataset}: {e}")
+            self.logger.error(f"Error retrieving publisher for dataset {dataset}: {e}")
             return None
 
     def _process_datasets(self, datasets, g):
@@ -459,8 +452,7 @@ class DcatCatalogCheck:
                 url = str(resource["url"])
                 if self._needs_check(url):
-                    checksum_resource = g.value(
-                        distribution, SPDX.checksum)
+                    checksum_resource = g.value(distribution, SPDX.checksum)
                     if checksum_resource:
                         resource["checksum_algorithm"] = str(
                             g.value(checksum_resource, SPDX.algorithm)
@@ -481,7 +473,8 @@ class DcatCatalogCheck:
     def read_previous_results(self, file_path):
         if not os.path.exists(file_path):
-            self.logger.warning(
-                f"File '{file_path}' does not exist. No previous results loaded.")
+            self.logger.warning(
+                f"File '{file_path}' does not exist. No previous results loaded."
+            )
             return
         loaded_count = 0
@@ -500,7 +493,8 @@ class DcatCatalogCheck:
                     url = json_object.get("url")
                     if not url:
-                        self.logger.warning(
-                            f"Line {line_number} is missing 'url': {line}")
+                        self.logger.warning(
+                            f"Line {line_number} is missing 'url': {line}"
+                        )
                         skipped_count += 1
                         continue
@@ -508,12 +502,12 @@ class DcatCatalogCheck:
                     loaded_count += 1
                 except json.JSONDecodeError as e:
-                    self.logger.error(
-                        f"Invalid JSON at line {line_number}: {e}")
+                    self.logger.error(f"Invalid JSON at line {line_number}: {e}")
                     skipped_count += 1
-        self.logger.info(
-            f"Loaded {loaded_count} results from '{file_path}', skipped {skipped_count} lines.")
+        self.logger.info(
+            f"Loaded {loaded_count} results from '{file_path}', skipped {skipped_count} lines."
+        )
 
     def read_dcat_catalog(self, url):
         while url:
@@ -536,8 +530,7 @@ class DcatCatalogCheck:
             self._process_datasets(datasets, g)
-            paged_collection = g.value(
-                predicate=RDF.type, object=HYDRA.PagedCollection)
+            paged_collection = g.value(predicate=RDF.type, object=HYDRA.PagedCollection)
             next_page = g.value(paged_collection, HYDRA.nextPage)
             url = str(next_page) if next_page else None
@@ -562,12 +555,9 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--url", help="DCAT catalog URL")
     parser.add_argument("--log_file", help="Log file path")
-    parser.add_argument(
-        "--results", help="File from which the results are loaded")
-    parser.add_argument("--verbose", action="store_true",
-                        help="Enable verbose logging")
-    parser.add_argument("--debug", action="store_true",
-                        help="Enable debug logging")
+    parser.add_argument("--results", help="File from which the results are loaded")
+    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
     parser.add_argument(
         "--recheck",
         action="store_true",
@@ -578,8 +568,7 @@ if __name__ == "__main__":
         action="store_true",
         help="Just check new entries from the catalog. Do not re-check existing results.",
     )
-    parser.add_argument(
-        "--check-format", help="Only check the specified format")
+    parser.add_argument("--check-format", help="Only check the specified format")
     parser.add_argument(
         "--force-check-format",
         help="Check distributinons with the specified format regardless of previous results",
...