Commit e7b06056 authored by Thorge Petersen

fix: keys in _clear_result() and code formatting

parent ca271b42
Merge request !1: Update Formats, Dependencies, and Dockerfile Configuration
@@ -154,7 +154,8 @@ class DcatCatalogCheck:
             "error",
             "etag",
             "http_status",
-            "last_check" "mimetype",
+            "last_check",
+            "mimetype",
             "mimetype_mismatch",
             "valid",
         ]:
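
This first hunk is the actual fix named in the commit message. In Python, adjacent string literals are concatenated at compile time, so the missing comma silently fused two keys into one. A minimal illustration of the failure mode (the clearing loop is paraphrased from the hunk's context, not copied from the script):

    result = {"last_check": "2024-01-01", "mimetype": "text/csv"}
    keys = ["last_check" "mimetype"]  # missing comma: one key "last_checkmimetype"
    for key in keys:
        result.pop(key, None)
    print(result)  # both entries survive; the fused key never matched
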
@@ -174,7 +175,8 @@ class DcatCatalogCheck:
         format = resource["format"].lower()
         try:
             # dynamically import the corresponding module for the format
-            format_check_module = importlib.import_module(f"formats.{format}_format")
+            format_check_module = importlib.import_module(
+                f"formats.{format}_format")
         except ModuleNotFoundError:
             format_check_module = None
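
The wrapped call is the usual importlib plugin-lookup pattern: the format name selects a module at runtime, and a missing module simply means there is no specialized check. A self-contained sketch of the same idea (the formats.<format>_format layout and the is_valid entry point both come from this diff):

    import importlib

    def load_format_checker(fmt):
        """Return the checker module for fmt, or None if none exists."""
        try:
            # e.g. "csv" -> formats/csv_format.py, which provides is_valid()
            return importlib.import_module(f"formats.{fmt.lower()}_format")
        except ModuleNotFoundError:
            return None
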
@@ -212,7 +214,8 @@ class DcatCatalogCheck:
         # write the content of the HTTP response into a temporary file
         original_file_name = url.split("/")[-1]
-        suffix = original_file_name.split(".")[-1] if "." in original_file_name else ""
+        suffix = original_file_name.split(
+            ".")[-1] if "." in original_file_name else ""
         with tempfile.NamedTemporaryFile(
             delete=False, suffix="." + suffix
         ) as temp_file:
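
As an aside, the reflowed suffix expression could also be written with pathlib; this is not what the commit does, only a near-equivalent alternative (it differs for dotfiles such as .bashrc, which pathlib treats as having no suffix):

    from pathlib import PurePosixPath

    def file_suffix(url):
        # PurePosixPath("data.csv").suffix == ".csv"; "" when there is no dot
        name = url.split("/")[-1]
        return PurePosixPath(name).suffix.lstrip(".")
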
@@ -235,7 +238,8 @@ class DcatCatalogCheck:
             decompressor = decompressors.get(resource["mimetype"])
             if not decompressor:
-                self.logger.warning(f"Unknown compression {resource['mimetype']}.")
+                self.logger.warning(
+                    f"Unknown compression {resource['mimetype']}.")
             else:
                 with tempfile.NamedTemporaryFile(delete=False) as decompressed_file:
                     with decompressor.open(temp_file.name, "rb") as compressed_file:
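
The decompressors mapping itself lies outside this hunk; all the code requires is an object with an open(path, mode) method, which the standard-library compression modules provide. A plausible sketch, with the MIME-type keys being assumptions rather than values taken from the source:

    import bz2
    import gzip
    import lzma

    # gzip, bz2 and lzma each expose open(filename, mode) returning a
    # file-like object, matching the decompressor.open(...) call above
    decompressors = {
        "application/gzip": gzip,
        "application/x-bzip2": bz2,
        "application/x-xz": lzma,
    }
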
@@ -246,7 +250,8 @@ class DcatCatalogCheck:
         resource["mimetype"] = self._guess_mime_type(temp_file.name)
         if self._is_container(resource["mimetype"], resource["format"]):
-            self._check_container_file(resource, temp_file, format_check_module)
+            self._check_container_file(
+                resource, temp_file, format_check_module)
         else:
             self._check_single_file(resource, temp_file, format_check_module)
@@ -274,7 +279,8 @@ class DcatCatalogCheck:
                         temp_file.write(file.read())
                         temp_file.flush()
-                        resource["mimetype"] = self._guess_mime_type(temp_file.name)
+                        resource["mimetype"] = self._guess_mime_type(
+                            temp_file.name)
                         validation_result = (
                             validation_result
                             and self._check_single_file(
@@ -288,12 +294,14 @@ class DcatCatalogCheck:
             return contains_at_least_one_relevant_file and validation_result
         else:
-            self.logger.error(f"Unsupported container format {resource['mimetype']}")
+            self.logger.error(
+                f"Unsupported container format {resource['mimetype']}")
 
     def _check_single_file(self, resource, temp_file, format_check_module):
         if format_check_module:
             # call the function `process` that is defined in every modul
-            resource["valid"] = format_check_module.is_valid(resource, temp_file)
+            resource["valid"] = format_check_module.is_valid(
+                resource, temp_file)
         else:
             # There is no specialized check for the specified format.
             # Does the returned MIME type match the promised format?
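
This dispatch pins down the checker-module contract: every formats.<fmt>_format module must expose is_valid(resource, temp_file) returning a bool (note the context comment still names `process` although the code calls is_valid). A minimal hypothetical module under that contract, with the CSV details invented for illustration:

    # formats/csv_format.py -- hypothetical example of the checker contract
    import csv

    def is_valid(resource, temp_file):
        """Return True if the downloaded file parses as CSV."""
        try:
            with open(temp_file.name, newline="", encoding="utf-8") as f:
                for _row in csv.reader(f):
                    pass
            return True
        except (csv.Error, UnicodeDecodeError):
            return False
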
@@ -318,7 +326,8 @@ class DcatCatalogCheck:
         ):
             hash_algorithm = hashlib.md5()
         else:
-            print(f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
+            print(
+                f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
             return
 
         with open(temp_file.name, "rb") as f:
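
The trailing context line suggests the selected hash algorithm is then fed the file's bytes; for reference, the standard chunked-digest idiom looks like this (the block size is an arbitrary choice, not taken from the source):

    import hashlib

    def file_digest(path, algo=hashlib.md5):
        h = algo()
        with open(path, "rb") as f:
            # fixed-size blocks keep large downloads out of memory
            for chunk in iter(lambda: f.read(65536), b""):
                h.update(chunk)
        return h.hexdigest()
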
@@ -413,7 +422,8 @@ class DcatCatalogCheck:
         publisher = graph.value(dataset, DCTERMS.publisher)
         if not publisher:
-            self.logger.warning(f"Publisher not found for dataset: {dataset}")
+            self.logger.warning(
+                f"Publisher not found for dataset: {dataset}")
             return None
 
         # Attempt to get the publisher's name
@@ -427,7 +437,8 @@ class DcatCatalogCheck:
         except Exception as e:
             # Log any unexpected errors
-            self.logger.error(f"Error retrieving publisher for dataset {dataset}: {e}")
+            self.logger.error(
+                f"Error retrieving publisher for dataset {dataset}: {e}")
             return None
 
     def _process_datasets(self, datasets, g):
@@ -452,7 +463,8 @@ class DcatCatalogCheck:
                 url = str(resource["url"])
                 if self._needs_check(url):
-                    checksum_resource = g.value(distribution, SPDX.checksum)
+                    checksum_resource = g.value(
+                        distribution, SPDX.checksum)
                     if checksum_resource:
                         resource["checksum_algorithm"] = str(
                             g.value(checksum_resource, SPDX.algorithm)
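
The checksum lookup follows DCAT's use of SPDX: a distribution points to a spdx:Checksum node carrying an algorithm and a value. A free-standing sketch of the traversal with rdflib, binding the SPDX namespace explicitly (the checksumValue property is the standard SPDX term, but its use here is an assumption; this hunk only shows the algorithm being read):

    from rdflib import Graph, Namespace

    SPDX = Namespace("http://spdx.org/rdf/terms#")

    def read_checksum(g, distribution):
        """Return (algorithm, value) from a spdx:checksum node, or None."""
        checksum = g.value(distribution, SPDX.checksum)
        if checksum is None:
            return None
        return (
            str(g.value(checksum, SPDX.algorithm)),
            str(g.value(checksum, SPDX.checksumValue)),
        )
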
@@ -502,7 +514,8 @@ class DcatCatalogCheck:
                     loaded_count += 1
                 except json.JSONDecodeError as e:
-                    self.logger.error(f"Invalid JSON at line {line_number}: {e}")
+                    self.logger.error(
+                        f"Invalid JSON at line {line_number}: {e}")
                     skipped_count += 1
 
         self.logger.info(
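
The per-line json.JSONDecodeError handling together with the loaded/skipped counters implies the results file is line-delimited JSON. A sketch of such a loader, keying on "url" (the only field this diff shows; the function name and printing are illustrative):

    import json

    def load_results(path):
        """Map url -> previously recorded result, one JSON object per line."""
        results = {}
        with open(path, encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                    results[record["url"]] = record
                except json.JSONDecodeError as e:
                    print(f"Invalid JSON at line {line_number}: {e}")
        return results
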
@@ -530,7 +543,8 @@ class DcatCatalogCheck:
             self._process_datasets(datasets, g)
 
-            paged_collection = g.value(predicate=RDF.type, object=HYDRA.PagedCollection)
+            paged_collection = g.value(
+                predicate=RDF.type, object=HYDRA.PagedCollection)
             next_page = g.value(paged_collection, HYDRA.nextPage)
             url = str(next_page) if next_page else None
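
The surrounding loop pages through the catalog by following hydra:nextPage links until none remain. Condensed into a free-standing sketch (the function name is illustrative; the Hydra namespace URI is the standard one):

    from rdflib import Graph, Namespace, RDF

    HYDRA = Namespace("http://www.w3.org/ns/hydra/core#")

    def crawl_catalog(url):
        while url:
            g = Graph()
            g.parse(url)  # rdflib guesses the serialization from the response
            yield g
            # locate the PagedCollection node, then its nextPage link, if any
            paged = g.value(predicate=RDF.type, object=HYDRA.PagedCollection)
            next_page = g.value(paged, HYDRA.nextPage)
            url = str(next_page) if next_page else None
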
@@ -555,9 +569,12 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--url", help="DCAT catalog URL")
     parser.add_argument("--log_file", help="Log file path")
-    parser.add_argument("--results", help="File from which the results are loaded")
-    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
-    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
+    parser.add_argument(
+        "--results", help="File from which the results are loaded")
+    parser.add_argument("--verbose", action="store_true",
+                        help="Enable verbose logging")
+    parser.add_argument("--debug", action="store_true",
+                        help="Enable debug logging")
     parser.add_argument(
         "--recheck",
         action="store_true",
@@ -568,7 +585,8 @@ if __name__ == "__main__":
         action="store_true",
         help="Just check new entries from the catalog. Do not re-check existing results.",
     )
-    parser.add_argument("--check-format", help="Only check the specified format")
+    parser.add_argument(
+        "--check-format", help="Only check the specified format")
     parser.add_argument(
         "--force-check-format",
         help="Check distributinons with the specified format regardless of previous results",
...