    # test_dcat_catalog_check.py
    # Unit tests for the DcatCatalogCheck class.
    import os
    import unittest
    import json
    from unittest.mock import patch, mock_open, MagicMock
    from dcat_catalog_check import (
        DcatCatalogCheck,
    )
    from rdflib import Graph
    from rdflib.namespace import RDF, DCAT
    
    from http.server import SimpleHTTPRequestHandler, HTTPServer
    import threading
    import time
    import io
    import requests
    import sys
    
    Jesper Zedlitz's avatar
    Jesper Zedlitz committed
    
    
    class TestDcatCatalogCheck(unittest.TestCase):
        def setUp(self):
            """Create the checker under test and replace its logger with a mock.

            The patch started here is stopped again in ``tearDown``.
            """
            self.dcc = DcatCatalogCheck("http://test.invalid:8000/", "my_api_key")

            # Swap in a MagicMock so tests can inspect the emitted log calls.
            logger_double = MagicMock()
            self.logger_patch = patch.object(self.dcc, "logger", logger_double)
            self.mock_logger = self.logger_patch.start()
    
        def tearDown(self):
            if os.path.exists("my_api_key"):
                os.remove("my_api_key")
            if os.path.exists("previous_results.json"):
                os.remove("previous_results.json")
            self.logger_patch.stop()
    
        def test_is_mime_type_compatible(self):
            """Check is_mime_type_compatible against a small format table."""
            self.dcc.allowed_file_formats = {
                "JSON": ["application/json"],
                "XML": ["application/xml"],
            }
            is_compatible = self.dcc.is_mime_type_compatible

            # MIME type listed for the given format.
            self.assertTrue(is_compatible("JSON", "application/json"))
            # MIME type listed only for a different format.
            self.assertFalse(is_compatible("JSON", "application/xml"))
            # Format that is not part of the table at all.
            self.assertFalse(is_compatible("UnknownFormat", "application/json"))
    
        def test_read_allowed_file_formats(self):
            """read_allowed_file_formats returns the parsed content of the mocked file."""
            file_content = '{"JSON": ["application/json"], "XML": ["application/xml"]}'
            expected = {"JSON": ["application/json"], "XML": ["application/xml"]}

            # Any open() call inside the method sees the JSON text above.
            fake_open = mock_open(read_data=file_content)
            with patch("builtins.open", fake_open):
                formats = self.dcc.read_allowed_file_formats()
                self.assertEqual(formats, expected)
    
        def test_load_uri_replacements(self):
            """load_uri_replacements returns the entries read from the mocked file."""
            with patch("os.path.exists", return_value=True):
                fake_open = mock_open(
                    read_data='[{"regex": "old", "replaced_by": "new"}]'
                )
                with patch("builtins.open", fake_open):
                    replacements = self.dcc.load_uri_replacements()
                    self.assertEqual(
                        replacements, [{"regex": "old", "replaced_by": "new"}]
                    )
    
        # Simulate that the file does not exist
    
        @patch("os.path.exists", return_value=False)
        def test_load_uri_replacements_file_not_exist(self, mock_exists):
            """With os.path.exists patched to return False, an empty list comes back."""
            self.assertEqual(self.dcc.load_uri_replacements(), [])
    
        @patch("dcat_catalog_check.requests.get")
        def test_load_http_complete(self, mock_get):
            """The result of load_http_complete carries the content of the patched response."""
            mock_get.return_value = MagicMock(content=b"content")

            response = self.dcc.load_http_complete("http://example.com")
            self.assertEqual(response.content, b"content")
    
        def test_get_publisher(self):
            """_get_publisher returns the foaf:name of a publisher given as a blank node."""
            g = Graph()
            g.parse(
                data=(
                    "@prefix dcat: <http://www.w3.org/ns/dcat#> .\n"
                    "@prefix dct: <http://purl.org/dc/terms/> .\n"
                    "@prefix foaf: <http://xmlns.com/foaf/0.1/> .\n"
                    "<http://example.org/DS> a dcat:Dataset; dct:publisher "
                    '[ a foaf:Organization ; foaf:name "The publisher" ] .'
                ),
                format="ttl",
            )
            datasets = list(g.subjects(predicate=RDF.type, object=DCAT.Dataset))
            # Guard against a vacuous pass: without this check the loop below
            # would run zero times (and assert nothing) if no dataset was found.
            self.assertEqual(len(datasets), 1)
            for dataset in datasets:
                result = self.dcc._get_publisher(g, dataset)
                self.assertEqual("The publisher", result)
    
        def test_get_publisher_url(self):
            """_get_publisher returns the URI when the publisher is given as a plain resource."""
            g = Graph()
            g.parse(
                data=(
                    "@prefix dcat: <http://www.w3.org/ns/dcat#> .\n"
                    "@prefix dct: <http://purl.org/dc/terms/> .\n"
                    "<http://example.org/DS> a dcat:Dataset; "
                    "dct:publisher <http://example.org/publisher> ."
                ),
                format="ttl",
            )
            datasets = list(g.subjects(predicate=RDF.type, object=DCAT.Dataset))
            # Guard against a vacuous pass: without this check the loop below
            # would run zero times (and assert nothing) if no dataset was found.
            self.assertEqual(len(datasets), 1)
            for dataset in datasets:
                result = self.dcc._get_publisher(g, dataset)
                self.assertEqual("http://example.org/publisher", result)
    
        def test_check_resource__json_valid(self):
            """check_resource on the correct.json fixture, served through a mocked HTTP 200."""
            with open("tests/data/correct.json", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            expected = {
                "accessible": True,
                "valid": True,
                "http_status": 200,
                "mimetype": "application/json",
            }
            for key, value in expected.items():
                self.assertEqual(resource[key], value)
    
        def test_check_resource__json_gz_valid(self):
            """check_resource on the gzip-compressed JSON fixture (mocked HTTP 200)."""
            with open("tests/data/correct.json.gz", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            expected = {
                "accessible": True,
                "valid": True,
                "http_status": 200,
                "mimetype": "application/json",
                "compress_format": "application/gzip",
            }
            for key, value in expected.items():
                self.assertEqual(resource[key], value)
    
        def test_check_resource__json_bz2_valid(self):
            """check_resource on the bzip2-compressed JSON fixture (mocked HTTP 200)."""
            with open("tests/data/correct.json.bz2", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            expected = {
                "accessible": True,
                "valid": True,
                "http_status": 200,
                "mimetype": "application/json",
                "compress_format": "application/x-bzip2",
            }
            for key, value in expected.items():
                self.assertEqual(resource[key], value)
    
        def test_check_resource__json_xz_valid(self):
            """check_resource on the xz-compressed JSON fixture (mocked HTTP 200)."""
            with open("tests/data/correct.json.xz", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            expected = {
                "accessible": True,
                "valid": True,
                "http_status": 200,
                "mimetype": "application/json",
                "compress_format": "application/x-xz",
            }
            for key, value in expected.items():
                self.assertEqual(resource[key], value)
    
        def test_check_resource__json_invalid(self):
            """check_resource on the incorrect.json fixture: accessible, but not valid."""
            with open("tests/data/incorrect.json", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            expected = {"accessible": True, "valid": False, "http_status": 200}
            for key, value in expected.items():
                self.assertEqual(resource[key], value)
    
        def test_check_resource__xml_valid(self):
            """check_resource on the correct.xml fixture, served through a mocked HTTP 200."""
            with open("tests/data/correct.xml", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/data", "format": "XML"}
            self.dcc.check_resource(resource)

            expected = {"accessible": True, "valid": True, "http_status": 200}
            for key, value in expected.items():
                self.assertEqual(resource[key], value)
    
        def test_check_resource__png_valid(self):
            """check_resource on the PNG fixture, including its declared SHA-1 checksum."""
            with open("tests/data/image.png", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {
                "url": "http://test.invalid/data",
                "format": "PNG",
                "checksum_algorithm": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1",
                "checksum_value": "a8643241029f9779302874db5c18b0f0bacbdd25",
            }
            self.dcc.check_resource(resource)

            expected = {
                "accessible": True,
                "valid": True,
                "http_status": 200,
                "mimetype": "image/png",
                "checksum_ok": True,
            }
            for key, value in expected.items():
                self.assertEqual(resource[key], value)
    
        def test_check_checksum(self):
            """The checksum check also works with the old DCAT-AP.de algorithm specifications."""
            resource = {
                "checksum_algorithm": "http://dcat-ap.de/def/hashAlgorithms/md/5",
                "checksum_value": "7e2fb748950d6d07ab3f75ac87f6f5da",
            }
            with open("tests/data/image.png", "rb") as image:
                self.dcc._check_checksum(resource, image)
            self.assertEqual(resource["checksum_ok"], True)
    
        def test_check_resource__one_json_in_zip_valid(self):
            """This ZIP file contains just one valid JSON file."""
            with open("tests/data/json-in-zip.zip", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            expected = {
                "accessible": True,
                "valid": True,
                "http_status": 200,
                "mimetype": "application/json",
                "package_format": "application/zip",
            }
            # .get() so that a missing key shows up as a None mismatch.
            for key, value in expected.items():
                self.assertEqual(resource.get(key), value)
    
        def test_check_resource__multiple_json_files_in_zip_valid(self):
            """This ZIP file contains several valid JSON files and one image."""
            with open("tests/data/jsons-in-zip.zip", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            expected = {
                "accessible": True,
                "valid": True,
                "http_status": 200,
                "mimetype": "application/json",
                "package_format": "application/zip",
            }
            # .get() so that a missing key shows up as a None mismatch.
            for key, value in expected.items():
                self.assertEqual(resource.get(key), value)
    
        def test_check_resource__no_json_in_zip_valid(self):
            """This ZIP file does not contain any JSON file, only other files."""
            with open("tests/data/png-in-zip.zip", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/data", "format": "JSON"}
            self.dcc.check_resource(resource)

            expected = {
                "accessible": True,
                "http_status": 200,
                "mimetype_mismatch": True,
                "package_format": "application/zip",
            }
            # .get() so that a missing key shows up as a None mismatch.
            for key, value in expected.items():
                self.assertEqual(resource.get(key), value)
    
        def test_is_container(self):
            """Table-driven check of _is_container for MIME type / format pairs."""
            self.dcc.read_allowed_file_formats()

            # (MIME type, declared format, expected truthiness of _is_container)
            cases = (
                ("image/png", "PNG", False),
                ("application/x-tar", "PNG", True),
                ("application/zip", "PNG", True),
                ("application/zip", "SHP", False),
                ("application/zip", "GTFS", False),
                ("application/zip", "ZIP", False),
            )
            for mimetype, file_format, expected in cases:
                check = self.assertTrue if expected else self.assertFalse
                check(self.dcc._is_container(mimetype, file_format))
    
        def test_check_resource__shp_with_multiple_layers(self):
            """This shape file contains multiple layers."""
            with open("tests/data/zos116.zip", "rb") as payload:
                fake_response = MagicMock(
                    status_code=200, reason="OK", content=payload.read()
                )
            # Bypass the network: every download returns the fixture response.
            self.dcc.load_http_complete = MagicMock(return_value=fake_response)

            resource = {"url": "http://test.invalid/zos116.zip", "format": "SHP"}
            self.dcc.check_resource(resource)

            self.assertIsNone(resource.get("error"))
            expected = {"accessible": True, "http_status": 200, "valid": True}
            for key, value in expected.items():
                self.assertEqual(resource.get(key), value)
    
        def test_read_previous_results(self):
            """Entries written one JSON object per line are loaded and keyed by URL."""
            entries = [
                {"url": "http://example.com", "status": "valid", "format": "JSON"},
                {"url": "http://example.org", "status": "invalid", "format": "XML"},
            ]

            # Create the input file; tearDown removes 'previous_results.json' again.
            with open("previous_results.json", "w", encoding="utf-8") as f:
                f.writelines(json.dumps(entry) + "\n" for entry in entries)

            self.dcc.read_previous_results("previous_results.json")

            loaded = self.dcc.previous_results
            self.assertEqual(len(loaded), 2)  # one entry per input line
            self.assertIn("http://example.com", loaded)
            self.assertIn("http://example.org", loaded)
            self.assertEqual(loaded["http://example.com"]["status"], "valid")
            self.assertEqual(loaded["http://example.org"]["status"], "invalid")
    
    Jesper Zedlitz's avatar
    Jesper Zedlitz committed
    
        @patch("os.path.exists", return_value=False)
        def test_read_previous_results_file_not_exist(self, mock_exists):
            """A missing input file results in a warning on the (mocked) logger."""
            expected_message = (
                "File 'non_existent_file.json' does not exist. No previous results loaded."
            )
            self.dcc.read_previous_results("non_existent_file.json")
            self.mock_logger.warning.assert_called_with(expected_message)
    
        @patch("builtins.open", mock_open(read_data="invalid_json"))
        @patch("os.path.exists", return_value=True)
        def test_read_previous_results_invalid_json(self, mock_exists):
            """A line that is not valid JSON results in an error on the (mocked) logger."""
            expected_message = (
                "Invalid JSON at line 1: Expecting value: line 1 column 1 (char 0)"
            )
            self.dcc.read_previous_results("invalid_json_file.json")
            self.mock_logger.error.assert_called_with(expected_message)
    
    
        @patch(
            "builtins.open",
            mock_open(
                read_data='{"status": "valid", "format": "JSON"}\n{"url": "http://example.com", "status": "valid", "format": "JSON"}'
            ),
        )
        @patch("os.path.exists", return_value=True)
        def test_read_previous_results_missing_url(self, mock_exists):
            """A line without a 'url' key results in a warning on the (mocked) logger."""
            expected_message = (
                'Line 1 is missing \'url\': {"status": "valid", "format": "JSON"}'
            )
            self.dcc.read_previous_results("missing_url_file.json")
            self.mock_logger.warning.assert_called_with(expected_message)
    
    
        def test_apply_uri_replacements(self):
            """Table-driven check of apply_uri_replacements, with and without rules."""
            self.dcc.uri_replacements = [
                {"regex": r"example\.com", "replaced_by": "test.com"},
                {"regex": r"http://", "replaced_by": "https://"},
            ]

            # (input URL, expected result)
            cases = (
                # both rules match
                ("http://example.com/path", "https://test.com/path"),
                # only the scheme rule matches
                ("http://other.com/path", "https://other.com/path"),
                # no rule matches
                ("https://unchanged.com/path", "https://unchanged.com/path"),
                # empty URL
                ("", ""),
            )
            for url, expected in cases:
                self.assertEqual(self.dcc.apply_uri_replacements(url), expected)

            # Without any rules the URL must pass through unchanged.
            self.dcc.uri_replacements = []
            untouched = self.dcc.apply_uri_replacements("http://example.com/path")
            self.assertEqual(untouched, "http://example.com/path")
    
        def test_clear_result(self):
            """_clear_result removes the result keys and leaves 'url' and 'format' alone."""
            cleared_keys = [
                "accessible",
                "checksum_ok",
                "duration",
                "error",
                "etag",
                "http_status",
                "last_check",
                "mimetype",
                "mimetype_mismatch",
                "valid",
            ]
            # Keys that must survive the call.
            kept = {"url": "http://example.com/data", "format": "JSON"}

            resource = {
                "accessible": True,
                "checksum_ok": True,
                "duration": 1.23,
                "error": "Some error",
                "etag": "some-etag",
                "http_status": 200,
                "last_check": "2024-12-27T12:34:56Z",
                "mimetype": "application/json",
                "mimetype_mismatch": False,
                "valid": True,
                **kept,
            }

            self.dcc._clear_result(resource)

            for key in cleared_keys:
                self.assertNotIn(key, resource)

            for key in kept:
                self.assertIn(key, resource)
            for key, value in kept.items():
                self.assertEqual(resource[key], value)
    
    
        def _wait_for_server(self, url, timeout=10, interval=0.2):
            """Wait until the server can be reached at the specified URL.

            Args:
                url: Address that is polled with HTTP GET requests.
                timeout: Maximum total time to wait, in seconds.
                interval: Pause between two attempts, in seconds.

            Returns:
                True as soon as a request is answered with status 200, False if
                the timeout elapses first.
            """
            # Monotonic clock: unaffected by wall-clock adjustments during the wait.
            deadline = time.monotonic() + timeout
            while True:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                try:
                    # Bound every request by the time that is left; without a
                    # timeout a hanging connection could block indefinitely.
                    response = requests.get(url, timeout=remaining)
                    if response.status_code == 200:
                        return True
                except requests.exceptions.RequestException:
                    # Server not up yet (or request timed out): retry.
                    pass
                time.sleep(interval)
            print(f"Timeout reached: Server at {url} not reachable.")
            return False
    
        def _run_server(self):
            """Serve the current working directory over HTTP on port 8000 (blocks forever)."""
            # An empty host string binds to all interfaces.
            HTTPServer(("", 8000), SimpleHTTPRequestHandler).serve_forever()
    
        def test_read_dcat_catalog(self):
            """Read a catalog from a local HTTP server and count the JSON lines printed.

            A daemon thread serves the working directory on port 8000; the text
            that read_dcat_catalog writes to stdout is captured and every line
            is parsed as JSON.
            """
            server_thread = threading.Thread(target=self._run_server, daemon=True)
            server_thread.start()
            # Fail with a clear message instead of a confusing downstream error
            # if the server never came up.
            self.assertTrue(
                self._wait_for_server("http://localhost:8000"),
                "local test HTTP server did not start",
            )

            # patch() restores the previous sys.stdout even if the call raises,
            # and puts back whatever stream was active before (e.g. a test
            # runner's capture) instead of forcing sys.__stdout__.
            with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
                self.dcc.read_dcat_catalog(
                    "http://localhost:8000/tests/data/all-tests.ttl"
                )

            output = mock_stdout.getvalue()
            json_objects = [json.loads(line) for line in output.splitlines()]

            self.assertEqual(len(json_objects), 31)
    
    
    Jesper Zedlitz's avatar
    Jesper Zedlitz committed
    
    # Run the test suite when this file is executed directly as a script.
    if __name__ == "__main__":
        unittest.main()