    # test_format_fidelity_checker.py -- unit tests for DcatCatalogCheck.
    import os
    import unittest
    import json
    from unittest.mock import patch, mock_open, MagicMock
    from dcat_catalog_check import (
        DcatCatalogCheck,
    )
    from rdflib import Graph
    from rdflib.namespace import RDF, DCAT
    
    
    class TestDcatCatalogCheck(unittest.TestCase):
        def setUp(self):
            """Create the checker under test and swap its logger for a mock."""
            self.dcc = DcatCatalogCheck("http://test.invalid:8000/", "my_api_key")

            # Mock the logger to capture log messages; tearDown stops the patch.
            self.logger_patch = patch.object(self.dcc, "logger", MagicMock())
            self.mock_logger = self.logger_patch.start()
    
        def tearDown(self):
            """Delete leftover files in the working directory and undo the logger patch."""
            for leftover in ("my_api_key", "previous_results.json"):
                if os.path.exists(leftover):
                    os.remove(leftover)
            self.logger_patch.stop()
    
        def test_is_mime_type_compatible(self):
            """A MIME type matches a format only if it is listed for that format."""
            self.dcc.allowed_file_formats = {
                "JSON": ["application/json"],
                "XML": ["application/xml"],
            }

            self.assertTrue(
                self.dcc.is_mime_type_compatible("JSON", "application/json")
            )
            self.assertFalse(
                self.dcc.is_mime_type_compatible("JSON", "application/xml")
            )
            # A format that is not configured at all must not match either.
            self.assertFalse(
                self.dcc.is_mime_type_compatible("UnknownFormat", "application/json")
            )
    
        def test_read_allowed_file_formats(self):
            """The mocked formats file is returned as the equivalent dict."""
            file_content = '{"JSON": ["application/json"], "XML": ["application/xml"]}'
            expected = {"JSON": ["application/json"], "XML": ["application/xml"]}

            # No real file is touched: every open() call yields file_content.
            with patch("builtins.open", mock_open(read_data=file_content)):
                formats = self.dcc.read_allowed_file_formats()

            self.assertEqual(formats, expected)
    
        def test_load_uri_replacements(self):
            """Replacement rules are loaded from the (mocked) replacements file."""
            rules_json = '[{"regex": "old", "replaced_by": "new"}]'

            # Pretend the file exists and serve rules_json for every open() call.
            with patch("os.path.exists", return_value=True), patch(
                "builtins.open", mock_open(read_data=rules_json)
            ):
                replacements = self.dcc.load_uri_replacements()

            self.assertEqual(replacements, [{"regex": "old", "replaced_by": "new"}])
    
        # Simulate that the file does not exist
    
        @patch("os.path.exists", return_value=False)
        def test_load_uri_replacements_file_not_exist(self, mock_exists):
            """With os.path.exists reporting False, an empty list is returned."""
            self.assertEqual(self.dcc.load_uri_replacements(), [])
    
        @patch("dcat_catalog_check.requests.get")
        def test_load_http_complete(self, mock_get):
            """With requests.get patched, the returned response exposes the mocked content."""
            fake_response = MagicMock(content=b"content")
            mock_get.return_value = fake_response

            result = self.dcc.load_http_complete("http://example.com")

            self.assertEqual(result.content, b"content")
    
        def test_get_publisher(self):
            """A publisher given as a foaf:Organization node is reported by its foaf:name."""
            turtle = (
                "@prefix dcat: <http://www.w3.org/ns/dcat#> .\n"
                "@prefix dct: <http://purl.org/dc/terms/> .\n"
                "@prefix foaf: <http://xmlns.com/foaf/0.1/> .\n"
                "<http://example.org/DS> a dcat:Dataset; "
                'dct:publisher [ a foaf:Organization ; foaf:name "The publisher" ] .'
            )
            g = Graph()
            g.parse(data=turtle, format="ttl")

            datasets = list(g.subjects(predicate=RDF.type, object=DCAT.Dataset))
            # Guard against a vacuous pass: if the query found nothing, the loop
            # below would be skipped and the test would succeed without asserting.
            self.assertEqual(len(datasets), 1)

            for dataset in datasets:
                result = self.dcc._get_publisher(g, dataset)
                self.assertEqual("The publisher", result)
    
        def test_get_publisher_url(self):
            """A publisher given only as a URI is reported as that URI."""
            turtle = (
                "@prefix dcat: <http://www.w3.org/ns/dcat#> .\n"
                "@prefix dct: <http://purl.org/dc/terms/> .\n"
                "<http://example.org/DS> a dcat:Dataset; "
                "dct:publisher <http://example.org/publisher> ."
            )
            g = Graph()
            g.parse(data=turtle, format="ttl")

            datasets = list(g.subjects(predicate=RDF.type, object=DCAT.Dataset))
            # Guard against a vacuous pass: if the query found nothing, the loop
            # below would be skipped and the test would succeed without asserting.
            self.assertEqual(len(datasets), 1)

            for dataset in datasets:
                result = self.dcc._get_publisher(g, dataset)
                self.assertEqual("http://example.org/publisher", result)
    
        def test_check_resource__json_valid(self):
            """The fixture correct.json is reported as accessible, valid application/json."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/correct.json", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            self.assertEqual(resource["accessible"], True)
            self.assertEqual(resource["valid"], True)
            self.assertEqual(resource["http_status"], 200)
            self.assertEqual(resource["mimetype"], "application/json")
    
        def test_check_resource__json_gz_valid(self):
            """The fixture correct.json.gz is reported as valid JSON with gzip compression."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/correct.json.gz", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            self.assertEqual(resource["accessible"], True)
            self.assertEqual(resource["valid"], True)
            self.assertEqual(resource["http_status"], 200)
            self.assertEqual(resource["mimetype"], "application/json")
            self.assertEqual(resource["compress_format"], "application/gzip")
    
        def test_check_resource__json_bz2_valid(self):
            """The fixture correct.json.bz2 is reported as valid JSON with bzip2 compression."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/correct.json.bz2", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            self.assertEqual(resource["accessible"], True)
            self.assertEqual(resource["valid"], True)
            self.assertEqual(resource["http_status"], 200)
            self.assertEqual(resource["mimetype"], "application/json")
            self.assertEqual(resource["compress_format"], "application/x-bzip2")
    
        def test_check_resource__json_xz_valid(self):
            """The fixture correct.json.xz is reported as valid JSON with xz compression."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/correct.json.xz", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            self.assertEqual(resource["accessible"], True)
            self.assertEqual(resource["valid"], True)
            self.assertEqual(resource["http_status"], 200)
            self.assertEqual(resource["mimetype"], "application/json")
            self.assertEqual(resource["compress_format"], "application/x-xz")
    
        def test_check_resource__json_invalid(self):
            """The fixture incorrect.json is accessible but must be flagged as not valid."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/incorrect.json", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            self.assertEqual(resource["accessible"], True)
            self.assertEqual(resource["valid"], False)
            self.assertEqual(resource["http_status"], 200)
    
        def test_check_resource__xml_valid(self):
            """The fixture correct.xml is reported as accessible and valid."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/correct.xml", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/data", "format": "XML"}
            self.dcc.check_resource(resource)

            self.assertEqual(resource["accessible"], True)
            self.assertEqual(resource["valid"], True)
            self.assertEqual(resource["http_status"], 200)
    
        def test_check_resource__png_valid(self):
            """The fixture image.png is reported as valid image/png with a matching SHA-1 checksum."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/image.png", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {
                "url": "http://test.invalid/data",
                "format": "PNG",
                "checksum_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1",
                "checksum_value": "a8643241029f9779302874db5c18b0f0bacbdd25",
            }

            self.dcc.check_resource(resource)

            self.assertEqual(resource["accessible"], True)
            self.assertEqual(resource["valid"], True)
            self.assertEqual(resource["http_status"], 200)
            self.assertEqual(resource["mimetype"], "image/png")
            self.assertEqual(resource["checksum_ok"], True)
    
        def test_check_checksum(self):
            """The checksum check also works with the old DCAT-AP.de algorithm specifications."""
            resource = {
                "checksum_algorithm": "http://dcat-ap.de/def/hashAlgorithms/md/5",
                "checksum_value": "7e2fb748950d6d07ab3f75ac87f6f5da",
            }
            with open("tests/data/image.png", "rb") as image:
                self.dcc._check_checksum(resource, image)

            self.assertEqual(resource["checksum_ok"], True)
    
        def test_check_resource__one_json_in_zip_valid(self):
            """This ZIP file contains just one valid JSON file."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/json-in-zip.zip", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            self.assertEqual(resource.get("accessible"), True)
            self.assertEqual(resource.get("valid"), True)
            self.assertEqual(resource.get("http_status"), 200)
            self.assertEqual(resource.get("mimetype"), "application/json")
            self.assertEqual(resource.get("package_format"), "application/zip")
    
        def test_check_resource__multiple_json_files_in_zip_valid(self):
            """This ZIP file contains several valid JSON files and one image."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/jsons-in-zip.zip", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            self.assertEqual(resource.get("accessible"), True)
            self.assertEqual(resource.get("valid"), True)
            self.assertEqual(resource.get("http_status"), 200)
            self.assertEqual(resource.get("mimetype"), "application/json")
            self.assertEqual(resource.get("package_format"), "application/zip")
    
        def test_check_resource__no_json_in_zip_valid(self):
            """This ZIP file does not contain any JSON file, only other files."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/png-in-zip.zip", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            self.assertEqual(resource.get("accessible"), True)
            self.assertEqual(resource.get("http_status"), 200)
            # Declared as JSON but the archive holds none, so a mismatch is expected.
            self.assertEqual(resource.get("mimetype_mismatch"), True)
            self.assertEqual(resource.get("package_format"), "application/zip")
    
        def test_is_container(self):
            """_is_container is true for tar/zip declared as PNG, false for PNG data and for zip declared as SHP, GTFS or ZIP."""
            self.dcc.read_allowed_file_formats()

            # (detected MIME type, declared format, expected truthiness)
            expectations = (
                ("image/png", "PNG", False),
                ("application/x-tar", "PNG", True),
                ("application/zip", "PNG", True),
                ("application/zip", "SHP", False),
                ("application/zip", "GTFS", False),
                ("application/zip", "ZIP", False),
            )
            for mimetype, declared_format, expected in expectations:
                check = self.assertTrue if expected else self.assertFalse
                check(self.dcc._is_container(mimetype, declared_format))
    
        def test_check_resource__shp_with_multiple_layers(self):
            """This shape file contains multiple layers."""
            mock_response = MagicMock()
            mock_response.status_code = 200
            mock_response.reason = "OK"
            with open("tests/data/zos116.zip", "rb") as file:
                mock_response.content = file.read()
            # Bypass the network: every download returns the canned response.
            self.dcc.load_http_complete = MagicMock(return_value=mock_response)

            resource = {"url": "http://test.invalid/zos116.zip", "format": "SHP"}

            self.dcc.check_resource(resource)

            self.assertIsNone(resource.get("error"))
            self.assertEqual(resource.get("accessible"), True)
            self.assertEqual(resource.get("http_status"), 200)
            self.assertEqual(resource.get("valid"), True)
    
        def test_read_previous_results(self):
            """Results written one JSON object per line are loaded and indexed by URL."""
            # Test data to simulate the contents of previous_results.json
            test_data = [
                {"url": "http://example.com", "status": "valid", "format": "JSON"},
                {"url": "http://example.org", "status": "invalid", "format": "XML"},
            ]

            # Write test data to 'previous_results.json' (removed again in tearDown).
            with open("previous_results.json", "w", encoding="utf-8") as f:
                for entry in test_data:
                    f.write(json.dumps(entry) + "\n")

            # Call the method to test
            self.dcc.read_previous_results("previous_results.json")

            # Assertions: Check if the data was loaded correctly into previous_results
            self.assertEqual(len(self.dcc.previous_results), 2)  # Expect 2 entries
            self.assertIn("http://example.com", self.dcc.previous_results)
            self.assertIn("http://example.org", self.dcc.previous_results)
            self.assertEqual(
                self.dcc.previous_results["http://example.com"]["status"], "valid"
            )
            self.assertEqual(
                self.dcc.previous_results["http://example.org"]["status"], "invalid"
            )
    
        @patch("os.path.exists", return_value=False)
        def test_read_previous_results_file_not_exist(self, mock_exists):
            """A missing results file is reported through a warning log message."""
            self.dcc.read_previous_results("non_existent_file.json")

            expected_warning = (
                "File 'non_existent_file.json' does not exist. No previous results loaded."
            )
            self.mock_logger.warning.assert_called_with(expected_warning)
    
        @patch("builtins.open", mock_open(read_data="invalid_json"))
        @patch("os.path.exists", return_value=True)
        def test_read_previous_results_invalid_json(self, mock_exists):
            """A line that is not JSON is reported through an error log message."""
            self.dcc.read_previous_results("invalid_json_file.json")

            expected_error = (
                "Invalid JSON at line 1: Expecting value: line 1 column 1 (char 0)"
            )
            self.mock_logger.error.assert_called_with(expected_error)
    
    
        @patch(
            "builtins.open",
            mock_open(
                read_data=(
                    '{"status": "valid", "format": "JSON"}\n'
                    '{"url": "http://example.com", "status": "valid", "format": "JSON"}'
                )
            ),
        )
        @patch("os.path.exists", return_value=True)
        def test_read_previous_results_missing_url(self, mock_exists):
            """Test when the file has a line with missing 'url'."""
            self.dcc.read_previous_results("missing_url_file.json")
            # Check if the warning log was triggered for the missing 'url'
            self.mock_logger.warning.assert_called_with(
                'Line 1 is missing \'url\': {"status": "valid", "format": "JSON"}'
            )
    
    
        def test_apply_uri_replacements(self):
            """Exercise apply_uri_replacements with matching, non-matching and empty URLs."""
            self.dcc.uri_replacements = [
                {"regex": r"example\.com", "replaced_by": "test.com"},
                {"regex": r"http://", "replaced_by": "https://"},
            ]

            # (input URL, expected URL after applying the rules above)
            cases = (
                # both rules match
                ("http://example.com/path", "https://test.com/path"),
                # only the scheme rule matches
                ("http://other.com/path", "https://other.com/path"),
                # nothing matches
                ("https://unchanged.com/path", "https://unchanged.com/path"),
                # empty URL
                ("", ""),
            )
            for url, expected in cases:
                result = self.dcc.apply_uri_replacements(url)
                self.assertEqual(result, expected)

            # With no rules defined the URL must come back unchanged.
            self.dcc.uri_replacements = []
            untouched = self.dcc.apply_uri_replacements("http://example.com/path")
            self.assertEqual(untouched, "http://example.com/path")
    
        def test_clear_result(self):
            """_clear_result removes the check-result keys and leaves 'url' and 'format' alone."""
            # Keys that must be gone after _clear_result has run.
            result_keys = [
                "accessible",
                "checksum_ok",
                "duration",
                "error",
                "etag",
                "http_status",
                "last_check",
                "mimetype",
                "mimetype_mismatch",
                "valid",
            ]
            resource = {
                "accessible": True,
                "checksum_ok": True,
                "duration": 1.23,
                "error": "Some error",
                "etag": "some-etag",
                "http_status": 200,
                "last_check": "2024-12-27T12:34:56Z",
                "mimetype": "application/json",
                "mimetype_mismatch": False,
                "valid": True,
                "url": "http://example.com/data",  # This key should remain untouched
                "format": "JSON",  # This key should remain untouched
            }

            self.dcc._clear_result(resource)

            # Check that all keys to clear have been removed
            for key in result_keys:
                self.assertNotIn(key, resource)

            # Check that unrelated keys remain
            self.assertIn("url", resource)
            self.assertIn("format", resource)
            self.assertEqual(resource["url"], "http://example.com/data")
            self.assertEqual(resource["format"], "JSON")
    
    # Run the test suite when this module is executed directly as a script.
    if __name__ == "__main__":
        unittest.main()