store size of the distribution

ruff format

store size of the distribution
152647cb · Jesper Zedlitz · b072c066 · 152647cb
Verified Commit 152647cb authored 6 months ago by Jesper Zedlitz
--- a/dcat_catalog_check.py
+++ b/dcat_catalog_check.py
@@ -174,8 +174,7 @@ class DcatCatalogCheck:
        format = resource["format"].lower()
        try:
            # dynamically import the corresponding module for the format
-            format_check_module = importlib.import_module(
+            format_check_module = importlib.import_module(f"formats.{format}_format")
-                f"formats.{format}_format")
        except ModuleNotFoundError:
            format_check_module = None
@@ -194,6 +193,9 @@ class DcatCatalogCheck:
            if "etag" in response.headers:
                resource["etag"] = response.headers["etag"]
+            if "content-length" in response.headers:
+                resource["size"] = response.headers["content-length"]
        except requests.exceptions.RequestException as err:
            # Handle connection, timeout, or other request errors
            resource["accessible"] = False
@@ -210,8 +212,7 @@ class DcatCatalogCheck:
        # write the content of the HTTP response into a temporary file
        original_file_name = url.split("/")[-1]
-        suffix = original_file_name.split(
+        suffix = original_file_name.split(".")[-1] if "." in original_file_name else ""
-            ".")[-1] if "." in original_file_name else ""
        with tempfile.NamedTemporaryFile(
            delete=False, suffix="." + suffix
        ) as temp_file:
@@ -234,8 +235,7 @@ class DcatCatalogCheck:
            decompressor = decompressors.get(resource["mimetype"])
            if not decompressor:
-                self.logger.warning(
+                self.logger.warning(f"Unknown compression {resource['mimetype']}.")
-                    f"Unknown compression {resource['mimetype']}.")
            else:
                with tempfile.NamedTemporaryFile(delete=False) as decompressed_file:
                    with decompressor.open(temp_file.name, "rb") as compressed_file:
@@ -246,8 +246,7 @@ class DcatCatalogCheck:
                resource["mimetype"] = self._guess_mime_type(temp_file.name)
        if self._is_container(resource["mimetype"], resource["format"]):
-            self._check_container_file(
+            self._check_container_file(resource, temp_file, format_check_module)
-                resource, temp_file, format_check_module)
        else:
            self._check_single_file(resource, temp_file, format_check_module)
@@ -275,8 +274,7 @@ class DcatCatalogCheck:
                            temp_file.write(file.read())
                            temp_file.flush()
-                            resource["mimetype"] = self._guess_mime_type(
+                            resource["mimetype"] = self._guess_mime_type(temp_file.name)
-                                temp_file.name)
                            validation_result = (
                                validation_result
                                and self._check_single_file(
@@ -290,14 +288,12 @@ class DcatCatalogCheck:
            return contains_at_least_one_relevant_file and validation_result
        else:
-            self.logger.error(
+            self.logger.error(f"Unsupported container format {resource['mimetype']}")
-                f"Unsupported container format {resource['mimetype']}")
    def _check_single_file(self, resource, temp_file, format_check_module):
        if format_check_module:
            # call the function `is_valid` that is defined in every module
-            resource["valid"] = format_check_module.is_valid(
+            resource["valid"] = format_check_module.is_valid(resource, temp_file)
-                resource, temp_file)
        else:
            # There is no specialized check for the specified format.
            # Does the returned MIME type match the promised format?
@@ -322,8 +318,7 @@ class DcatCatalogCheck:
        ):
            hash_algorithm = hashlib.md5()
        else:
-            print(
+            print(f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
-                f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr)
            return
        with open(temp_file.name, "rb") as f:
@@ -418,8 +413,7 @@ class DcatCatalogCheck:
            publisher = graph.value(dataset, DCTERMS.publisher)
            if not publisher:
-                self.logger.warning(
+                self.logger.warning(f"Publisher not found for dataset: {dataset}")
-                    f"Publisher not found for dataset: {dataset}")
                return None
            # Attempt to get the publisher's name
@@ -433,8 +427,7 @@ class DcatCatalogCheck:
        except Exception as e:
            # Log any unexpected errors
-            self.logger.error(
+            self.logger.error(f"Error retrieving publisher for dataset {dataset}: {e}")
-                f"Error retrieving publisher for dataset {dataset}: {e}")
            return None
    def _process_datasets(self, datasets, g):
@@ -459,8 +452,7 @@ class DcatCatalogCheck:
                    url = str(resource["url"])
                    if self._needs_check(url):
-                        checksum_resource = g.value(
+                        checksum_resource = g.value(distribution, SPDX.checksum)
-                            distribution, SPDX.checksum)
                        if checksum_resource:
                            resource["checksum_algorithm"] = str(
                                g.value(checksum_resource, SPDX.algorithm)
@@ -481,7 +473,8 @@ class DcatCatalogCheck:
    def read_previous_results(self, file_path):
        if not os.path.exists(file_path):
            self.logger.warning(
-                f"File '{file_path}' does not exist. No previous results loaded.")
+                f"File '{file_path}' does not exist. No previous results loaded."
+            )
            return
        loaded_count = 0
@@ -500,7 +493,8 @@ class DcatCatalogCheck:
                    url = json_object.get("url")
                    if not url:
                        self.logger.warning(
-                            f"Line {line_number} is missing 'url': {line}")
+                            f"Line {line_number} is missing 'url': {line}"
+                        )
                        skipped_count += 1
                        continue
@@ -508,12 +502,12 @@ class DcatCatalogCheck:
                    loaded_count += 1
                except json.JSONDecodeError as e:
-                    self.logger.error(
+                    self.logger.error(f"Invalid JSON at line {line_number}: {e}")
-                        f"Invalid JSON at line {line_number}: {e}")
                    skipped_count += 1
        self.logger.info(
-            f"Loaded {loaded_count} results from '{file_path}', skipped {skipped_count} lines.")
+            f"Loaded {loaded_count} results from '{file_path}', skipped {skipped_count} lines."
+        )
    def read_dcat_catalog(self, url):
        while url:
@@ -536,8 +530,7 @@ class DcatCatalogCheck:
            self._process_datasets(datasets, g)
-            paged_collection = g.value(
+            paged_collection = g.value(predicate=RDF.type, object=HYDRA.PagedCollection)
-                predicate=RDF.type, object=HYDRA.PagedCollection)
            next_page = g.value(paged_collection, HYDRA.nextPage)
            url = str(next_page) if next_page else None
@@ -562,12 +555,9 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", help="DCAT catalog URL")
    parser.add_argument("--log_file", help="Log file path")
-    parser.add_argument(
+    parser.add_argument("--results", help="File from which the results are loaded")
-        "--results", help="File from which the results are loaded")
+    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
-    parser.add_argument("--verbose", action="store_true",
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
-                        help="Enable verbose logging")
-    parser.add_argument("--debug", action="store_true",
-                        help="Enable debug logging")
    parser.add_argument(
        "--recheck",
        action="store_true",
@@ -578,8 +568,7 @@ if __name__ == "__main__":
        action="store_true",
        help="Just check new entries from the catalog. Do not re-check existing results.",
    )
-    parser.add_argument(
+    parser.add_argument("--check-format", help="Only check the specified format")
-        "--check-format", help="Only check the specified format")
    parser.add_argument(
        "--force-check-format",
        help="Check distributinons with the specified format regardless of previous results",