"""Unit tests for the ``DcatCatalogCheck`` class."""

import json
import os
import unittest
from unittest.mock import MagicMock, mock_open, patch

from dcat_catalog_check import (
    DcatCatalogCheck,
)
from rdflib import Graph
from rdflib.namespace import DCAT, RDF


class TestDcatCatalogCheck(unittest.TestCase):
    """Exercise ``DcatCatalogCheck`` without performing real HTTP requests.

    Downloads are simulated by replacing ``load_http_complete`` with a mock
    whose response body is read from fixture files below ``tests/data``.
    """

    def setUp(self):
        """Create the object under test and replace its logger with a mock."""
        self.dcc = DcatCatalogCheck("http://test.invalid:8000/", "my_api_key")

        # Mock the logger to capture log messages
        self.logger_patch = patch.object(self.dcc, "logger", MagicMock())
        self.mock_logger = self.logger_patch.start()
        # Registered as a cleanup so the patch is undone even when tearDown
        # itself raises (e.g. a file cannot be removed).
        self.addCleanup(self.logger_patch.stop)

    def tearDown(self):
        """Remove files that a test run may have left in the working directory."""
        for leftover in ("my_api_key", "previous_results.json"):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                # Nothing to clean up for this test.
                pass

    def _check_resource_from_file(
        self, path, file_format, url="http://test.invalid/data", **extra_fields
    ):
        """Run ``check_resource`` on a resource whose download is a local file.

        Args:
            path: Fixture file whose bytes become the mocked response content.
            file_format: Value stored under the resource's ``"format"`` key.
            url: Value stored under the resource's ``"url"`` key.
            **extra_fields: Additional resource entries (e.g. checksum data)
                set before the check runs.

        Returns:
            The resource dict after ``check_resource`` has processed it.
        """
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.reason = "OK"
        with open(path, "rb") as file:
            mock_response.content = file.read()
        self.dcc.load_http_complete = MagicMock(return_value=mock_response)

        resource = {"url": url, "format": file_format, **extra_fields}
        self.dcc.check_resource(resource)
        return resource

    def test_is_mime_type_compatible(self):
        """A MIME type is compatible only with the format that lists it."""
        self.dcc.allowed_file_formats = {
            "JSON": ["application/json"],
            "XML": ["application/xml"],
        }
        self.assertTrue(
            self.dcc.is_mime_type_compatible("JSON", "application/json")
        )
        self.assertFalse(
            self.dcc.is_mime_type_compatible("JSON", "application/xml")
        )
        self.assertFalse(
            self.dcc.is_mime_type_compatible("UnknownFormat", "application/json")
        )

    def test_read_allowed_file_formats(self):
        """The allowed formats are parsed from the JSON content of the file."""
        file_content = '{"JSON": ["application/json"], "XML": ["application/xml"]}'
        with patch("builtins.open", mock_open(read_data=file_content)):
            formats = self.dcc.read_allowed_file_formats()

        self.assertEqual(
            formats, {"JSON": ["application/json"], "XML": ["application/xml"]}
        )

    def test_load_uri_replacements(self):
        """Replacements are parsed from the JSON file when it exists."""
        file_content = '[{"regex": "old", "replaced_by": "new"}]'
        with patch("os.path.exists", return_value=True), patch(
            "builtins.open", mock_open(read_data=file_content)
        ):
            replacements = self.dcc.load_uri_replacements()

        self.assertEqual(
            replacements, [{"regex": "old", "replaced_by": "new"}]
        )

    # Simulate that the file does not exist
    @patch("os.path.exists", return_value=False)
    def test_load_uri_replacements_file_not_exist(self, mock_exists):
        """An empty list is returned when the replacements file is missing."""
        replacements = self.dcc.load_uri_replacements()

        self.assertEqual(replacements, [])

    @patch("dcat_catalog_check.requests.get")
    def test_load_http_complete(self, mock_get):
        """The content of the ``requests.get`` response is passed through."""
        mock_response = MagicMock()
        mock_response.content = b"content"
        mock_get.return_value = mock_response

        response = self.dcc.load_http_complete("http://example.com")

        self.assertEqual(response.content, b"content")

    def test_get_publisher(self):
        """The ``foaf:name`` of a blank-node publisher is returned."""
        g = Graph()
        g.parse(
            data=(
                "@prefix dcat: <http://www.w3.org/ns/dcat#> .\n"
                "@prefix dct: <http://purl.org/dc/terms/> .\n"
                "@prefix foaf: <http://xmlns.com/foaf/0.1/> .\n"
                "<http://example.org/DS> a dcat:Dataset; "
                'dct:publisher [ a foaf:Organization ; foaf:name "The publisher" ] .'
            ),
            format="ttl",
        )

        datasets = list(g.subjects(predicate=RDF.type, object=DCAT.Dataset))
        # Guard against a vacuous pass: without a dataset nothing is checked.
        self.assertEqual(len(datasets), 1)
        for dataset in datasets:
            result = self.dcc._get_publisher(g, dataset)
            self.assertEqual("The publisher", result)

    def test_get_publisher_url(self):
        """The publisher URI is returned when the publisher is a plain IRI."""
        g = Graph()
        g.parse(
            data=(
                "@prefix dcat: <http://www.w3.org/ns/dcat#> .\n"
                "@prefix dct: <http://purl.org/dc/terms/> .\n"
                "<http://example.org/DS> a dcat:Dataset; "
                "dct:publisher <http://example.org/publisher> ."
            ),
            format="ttl",
        )

        datasets = list(g.subjects(predicate=RDF.type, object=DCAT.Dataset))
        # Guard against a vacuous pass: without a dataset nothing is checked.
        self.assertEqual(len(datasets), 1)
        for dataset in datasets:
            result = self.dcc._get_publisher(g, dataset)
            self.assertEqual("http://example.org/publisher", result)

    def test_check_resource__json_valid(self):
        """A well-formed JSON file is accessible, valid and typed as JSON."""
        resource = self._check_resource_from_file(
            "tests/data/correct.json", "JSON"
        )

        self.assertEqual(resource["accessible"], True)
        self.assertEqual(resource["valid"], True)
        self.assertEqual(resource["http_status"], 200)
        self.assertEqual(resource["mimetype"], "application/json")

    def test_check_resource__json_gz_valid(self):
        """Gzip-compressed JSON is validated and the compression reported."""
        resource = self._check_resource_from_file(
            "tests/data/correct.json.gz", "JSON"
        )

        self.assertEqual(resource["accessible"], True)
        self.assertEqual(resource["valid"], True)
        self.assertEqual(resource["http_status"], 200)
        self.assertEqual(resource["mimetype"], "application/json")
        self.assertEqual(resource["compress_format"], "application/gzip")

    def test_check_resource__json_bz2_valid(self):
        """Bzip2-compressed JSON is validated and the compression reported."""
        resource = self._check_resource_from_file(
            "tests/data/correct.json.bz2", "JSON"
        )

        self.assertEqual(resource["accessible"], True)
        self.assertEqual(resource["valid"], True)
        self.assertEqual(resource["http_status"], 200)
        self.assertEqual(resource["mimetype"], "application/json")
        self.assertEqual(resource["compress_format"], "application/x-bzip2")

    def test_check_resource__json_xz_valid(self):
        """XZ-compressed JSON is validated and the compression reported."""
        resource = self._check_resource_from_file(
            "tests/data/correct.json.xz", "JSON"
        )

        self.assertEqual(resource["accessible"], True)
        self.assertEqual(resource["valid"], True)
        self.assertEqual(resource["http_status"], 200)
        self.assertEqual(resource["mimetype"], "application/json")
        self.assertEqual(resource["compress_format"], "application/x-xz")

    def test_check_resource__json_invalid(self):
        """A malformed JSON file is accessible but flagged as not valid."""
        resource = self._check_resource_from_file(
            "tests/data/incorrect.json", "JSON"
        )

        self.assertEqual(resource["accessible"], True)
        self.assertEqual(resource["valid"], False)
        self.assertEqual(resource["http_status"], 200)

    def test_check_resource__xml_valid(self):
        """A well-formed XML file is accessible and valid."""
        resource = self._check_resource_from_file(
            "tests/data/correct.xml", "XML"
        )

        self.assertEqual(resource["accessible"], True)
        self.assertEqual(resource["valid"], True)
        self.assertEqual(resource["http_status"], 200)

    def test_check_resource__png_valid(self):
        """A PNG with a matching SHA-1 checksum is valid and checksum_ok."""
        resource = self._check_resource_from_file(
            "tests/data/image.png",
            "PNG",
            checksum_algorithm=(
                "http://spdx.org/rdf/terms#checksumAlgorithm_sha1"
            ),
            checksum_value="a8643241029f9779302874db5c18b0f0bacbdd25",
        )

        self.assertEqual(resource["accessible"], True)
        self.assertEqual(resource["valid"], True)
        self.assertEqual(resource["http_status"], 200)
        self.assertEqual(resource["mimetype"], "image/png")
        self.assertEqual(resource["checksum_ok"], True)

    def test_check_checksum(self):
        """The checksum check also works with the old DCAT-AP.de algorithm specifications"""
        resource = {
            "checksum_algorithm": "http://dcat-ap.de/def/hashAlgorithms/md/5",
            "checksum_value": "7e2fb748950d6d07ab3f75ac87f6f5da",
        }

        with open("tests/data/image.png", "rb") as file:
            self.dcc._check_checksum(resource, file)

        self.assertEqual(resource["checksum_ok"], True)

    def test_check_resource__one_json_in_zip_valid(self):
        """This ZIP file contains just one valid JSON file."""
        resource = self._check_resource_from_file(
            "tests/data/json-in-zip.zip", "JSON"
        )

        self.assertEqual(resource.get("accessible"), True)
        self.assertEqual(resource.get("valid"), True)
        self.assertEqual(resource.get("http_status"), 200)
        self.assertEqual(resource.get("mimetype"), "application/json")
        self.assertEqual(resource.get("package_format"), "application/zip")

    def test_check_resource__multiple_json_files_in_zip_valid(self):
        """This ZIP file contains several valid JSON files and one image."""
        resource = self._check_resource_from_file(
            "tests/data/jsons-in-zip.zip", "JSON"
        )

        self.assertEqual(resource.get("accessible"), True)
        self.assertEqual(resource.get("valid"), True)
        self.assertEqual(resource.get("http_status"), 200)
        self.assertEqual(resource.get("mimetype"), "application/json")
        self.assertEqual(resource.get("package_format"), "application/zip")

    def test_check_resource__no_json_in_zip_valid(self):
        """This ZIP file does not contain any JSON file only other files."""
        resource = self._check_resource_from_file(
            "tests/data/png-in-zip.zip", "JSON"
        )

        self.assertEqual(resource.get("accessible"), True)
        self.assertEqual(resource.get("http_status"), 200)
        self.assertEqual(resource.get("mimetype_mismatch"), True)
        self.assertEqual(resource.get("package_format"), "application/zip")

    def test_is_container(self):
        """Check ``_is_container`` for several MIME type / format pairs."""
        self.dcc.read_allowed_file_formats()

        self.assertFalse(self.dcc._is_container("image/png", "PNG"))
        # Archive MIME types count as containers for a PNG resource ...
        self.assertTrue(self.dcc._is_container("application/x-tar", "PNG"))
        self.assertTrue(self.dcc._is_container("application/zip", "PNG"))
        # ... but a ZIP is not a container for these declared formats.
        self.assertFalse(self.dcc._is_container("application/zip", "SHP"))
        self.assertFalse(self.dcc._is_container("application/zip", "GTFS"))
        self.assertFalse(self.dcc._is_container("application/zip", "ZIP"))

    def test_check_resource__shp_with_multiple_layers(self):
        """This shape file contains multiple layers."""
        resource = self._check_resource_from_file(
            "tests/data/zos116.zip",
            "SHP",
            url="http://test.invalid/zos116.zip",
        )

        self.assertIsNone(resource.get("error"))
        self.assertEqual(resource.get("accessible"), True)
        self.assertEqual(resource.get("http_status"), 200)
        self.assertEqual(resource.get("valid"), True)

    def test_read_previous_results(self):
        """Results stored one JSON object per line are indexed by their URL."""
        # Test data to simulate the contents of previous_results.json
        test_data = [
            {"url": "http://example.com", "status": "valid", "format": "JSON"},
            {"url": "http://example.org", "status": "invalid", "format": "XML"},
        ]

        # Write test data to a file 'previous_results.json'
        # (removed again in tearDown)
        with open("previous_results.json", "w", encoding="utf-8") as f:
            for entry in test_data:
                f.write(json.dumps(entry) + "\n")

        # Call the method to test
        self.dcc.read_previous_results("previous_results.json")

        # Check if the data was loaded correctly into previous_results
        self.assertEqual(len(self.dcc.previous_results), 2)  # Expect 2 entries
        self.assertIn("http://example.com", self.dcc.previous_results)
        self.assertIn("http://example.org", self.dcc.previous_results)
        self.assertEqual(
            self.dcc.previous_results["http://example.com"]["status"], "valid"
        )
        self.assertEqual(
            self.dcc.previous_results["http://example.org"]["status"], "invalid"
        )

    @patch("os.path.exists", return_value=False)
    def test_read_previous_results_file_not_exist(self, mock_exists):
        """Test when the file does not exist."""
        self.dcc.read_previous_results("non_existent_file.json")

        # Check that the warning log was triggered
        self.mock_logger.warning.assert_called_with(
            "File 'non_existent_file.json' does not exist. No previous results loaded."
        )

    @patch("builtins.open", mock_open(read_data="invalid_json"))
    @patch("os.path.exists", return_value=True)
    def test_read_previous_results_invalid_json(self, mock_exists):
        """Test when the file contains invalid JSON."""
        self.dcc.read_previous_results("invalid_json_file.json")

        # Check if the error log was triggered for invalid JSON
        self.mock_logger.error.assert_called_with(
            "Invalid JSON at line 1: Expecting value: line 1 column 1 (char 0)"
        )

    @patch(
        "builtins.open",
        mock_open(
            read_data='{"status": "valid", "format": "JSON"}\n{"url": "http://example.com", "status": "valid", "format": "JSON"}'
        ),
    )
    @patch("os.path.exists", return_value=True)
    def test_read_previous_results_missing_url(self, mock_exists):
        """Test when the file has a line with missing 'url'."""
        self.dcc.read_previous_results("missing_url_file.json")

        # Check if the warning log was triggered for the missing 'url'
        self.mock_logger.warning.assert_called_with(
            'Line 1 is missing \'url\': {"status": "valid", "format": "JSON"}'
        )

    def test_apply_uri_replacements(self):
        """Test the apply_uri_replacements method."""
        # Setup URI replacements
        self.dcc.uri_replacements = [
            {"regex": r"example\.com", "replaced_by": "test.com"},
            {"regex": r"http://", "replaced_by": "https://"},
        ]

        cases = [
            # URL matching both replacements
            ("http://example.com/path", "https://test.com/path"),
            # URL matching only one replacement
            ("http://other.com/path", "https://other.com/path"),
            # URL with no matches
            ("https://unchanged.com/path", "https://unchanged.com/path"),
            # Empty URL
            ("", ""),
        ]
        for url, expected in cases:
            with self.subTest(url=url):
                self.assertEqual(self.dcc.apply_uri_replacements(url), expected)

        # No URI replacements defined
        self.dcc.uri_replacements = []
        url = "http://example.com/path"
        self.assertEqual(
            self.dcc.apply_uri_replacements(url), "http://example.com/path"
        )

    def test_clear_result(self):
        """Test the _clear_result method."""
        cleared_keys = (
            "accessible",
            "checksum_ok",
            "duration",
            "error",
            "etag",
            "http_status",
            "last_check",
            "mimetype",
            "mimetype_mismatch",
            "valid",
        )

        # A resource with keys to clear and some additional keys
        resource = {
            "accessible": True,
            "checksum_ok": True,
            "duration": 1.23,
            "error": "Some error",
            "etag": "some-etag",
            "http_status": 200,
            "last_check": "2024-12-27T12:34:56Z",
            "mimetype": "application/json",
            "mimetype_mismatch": False,
            "valid": True,
            "url": "http://example.com/data",  # This key should remain untouched
            "format": "JSON",  # This key should remain untouched
        }

        self.dcc._clear_result(resource)

        # Check that all keys to clear have been removed
        for key in cleared_keys:
            with self.subTest(key=key):
                self.assertNotIn(key, resource)

        # Check that unrelated keys remain
        self.assertIn("url", resource)
        self.assertIn("format", resource)
        self.assertEqual(resource["url"], "http://example.com/data")
        self.assertEqual(resource["format"], "JSON")


if __name__ == "__main__":
    unittest.main()