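fix: add missing comma between the "last_check" and "mimetype" keys in _clear_result() (they were implicitly concatenated into one string), and re-wrap long lines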

e7b06056 · Thorge Petersen · ca271b42 · e7b06056
Commit e7b06056 authored 4 months ago by Thorge Petersen
--- a/dcat_catalog_check.py
+++ b/dcat_catalog_check.py
@@ -154,7 +154,8 @@ class DcatCatalogCheck:
            "error",
            "etag",
            "http_status",
-            "last_check" "mimetype",
+            "last_check",
+            "mimetype",
            "mimetype_mismatch",
            "valid",
        ]:
@@ -174,7 +175,8 @@ class DcatCatalogCheck:
        format = resource["format"].lower()
        try:
            # dynamically import the corresponding module for the format
-            format_check_module = importlib.import_module(f"formats.{format}_format")
+            format_check_module = importlib.import_module(
+                f"formats.{format}_format")
        except ModuleNotFoundError:
            format_check_module = None
@@ -212,7 +214,8 @@ class DcatCatalogCheck:
        # write the content of the HTTP response into a temporary file
        original_file_name = url.split("/")[-1]
-        suffix = original_file_name.split(".")[-1] if "." in original_file_name else ""
+        suffix = original_file_name.split(
+            ".")[-1] if "." in original_file_name else ""
        with tempfile.NamedTemporaryFile(
            delete=False, suffix="." + suffix
        ) as temp_file:
@@ -235,7 +238,8 @@ class DcatCatalogCheck:
            decompressor = decompressors.get(resource["mimetype"])
            if not decompressor:
-                self.logger.warning(f"Unknown compression {resource['mimetype']}.")
+                self.logger.warning(
+                    f"Unknown compression {resource['mimetype']}.")
            else:
                with tempfile.NamedTemporaryFile(delete=False) as decompressed_file:
                    with decompressor.open(temp_file.name, "rb") as compressed_file:
@@ -246,7 +250,8 @@ class DcatCatalogCheck:
                resource["mimetype"] = self._guess_mime_type(temp_file.name)
        if self._is_container(resource["mimetype"], resource["format"]):
-            self._check_container_file(resource, temp_file, format_check_module)
+            self._check_container_file(
+                resource, temp_file, format_check_module)
        else:
            self._check_single_file(resource, temp_file, format_check_module)
@@ -274,7 +279,8 @@ class DcatCatalogCheck:
                            temp_file.write(file.read())
                            temp_file.flush()
-                            resource["mimetype"] = self._guess_mime_type(temp_file.name)
+                            resource["mimetype"] = self._guess_mime_type(
+                                temp_file.name)
                            validation_result = (
                                validation_result
                                and self._check_single_file(
@@ -288,12 +294,14 @@ class DcatCatalogCheck:
            return contains_at_least_one_relevant_file and validation_result
        else:
-            self.logger.error(f"Unsupported container format {resource['mimetype']}")
+            self.logger.error(
+                f"Unsupported container format {resource['mimetype']}")
    def _check_single_file(self, resource, temp_file, format_check_module):
        if format_check_module:
            # call the function `process` that is defined in every modul
-            resource["valid"] = format_check_module.is_valid(resource, temp_file)
+            resource["valid"] = format_check_module.is_valid(
+                resource, temp_file)
        else:
            # There is no specialized check for the specified format.
            # Does the returned MIME type match the promised format?
@@ -318,7 +326,8 @@ class DcatCatalogCheck:
        ):
            hash_algorithm = hashlib.md5()
        else:
-            print(f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
+            print(
+                f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
            return
        with open(temp_file.name, "rb") as f:
@@ -413,7 +422,8 @@ class DcatCatalogCheck:
            publisher = graph.value(dataset, DCTERMS.publisher)
            if not publisher:
-                self.logger.warning(f"Publisher not found for dataset: {dataset}")
+                self.logger.warning(
+                    f"Publisher not found for dataset: {dataset}")
                return None
            # Attempt to get the publisher's name
@@ -427,7 +437,8 @@ class DcatCatalogCheck:
        except Exception as e:
            # Log any unexpected errors
-            self.logger.error(f"Error retrieving publisher for dataset {dataset}: {e}")
+            self.logger.error(
+                f"Error retrieving publisher for dataset {dataset}: {e}")
            return None
    def _process_datasets(self, datasets, g):
@@ -452,7 +463,8 @@ class DcatCatalogCheck:
                    url = str(resource["url"])
                    if self._needs_check(url):
-                        checksum_resource = g.value(distribution, SPDX.checksum)
+                        checksum_resource = g.value(
+                            distribution, SPDX.checksum)
                        if checksum_resource:
                            resource["checksum_algorithm"] = str(
                                g.value(checksum_resource, SPDX.algorithm)
@@ -502,7 +514,8 @@ class DcatCatalogCheck:
                    loaded_count += 1
                except json.JSONDecodeError as e:
-                    self.logger.error(f"Invalid JSON at line {line_number}: {e}")
+                    self.logger.error(
+                        f"Invalid JSON at line {line_number}: {e}")
                    skipped_count += 1
        self.logger.info(
@@ -530,7 +543,8 @@ class DcatCatalogCheck:
            self._process_datasets(datasets, g)
-            paged_collection = g.value(predicate=RDF.type, object=HYDRA.PagedCollection)
+            paged_collection = g.value(
+                predicate=RDF.type, object=HYDRA.PagedCollection)
            next_page = g.value(paged_collection, HYDRA.nextPage)
            url = str(next_page) if next_page else None
@@ -555,9 +569,12 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", help="DCAT catalog URL")
    parser.add_argument("--log_file", help="Log file path")
-    parser.add_argument("--results", help="File from which the results are loaded")
+    parser.add_argument(
-    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+        "--results", help="File from which the results are loaded")
-    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
+    parser.add_argument("--verbose", action="store_true",
+                        help="Enable verbose logging")
+    parser.add_argument("--debug", action="store_true",
+                        help="Enable debug logging")
    parser.add_argument(
        "--recheck",
        action="store_true",
@@ -568,7 +585,8 @@ if __name__ == "__main__":
        action="store_true",
        help="Just check new entries from the catalog. Do not re-check existing results.",
    )
-    parser.add_argument("--check-format", help="Only check the specified format")
+    parser.add_argument(
+        "--check-format", help="Only check the specified format")
    parser.add_argument(
        "--force-check-format",
        help="Check distributinons with the specified format regardless of previous results",