diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000000000000000000000000000000000..49addf786e2886ced2e5889f3367aedfdae7b4de --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[run] +omit = + tests/* \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index aa31e12a46c7917d324bb1732055a60621a77e88..8e5a53416e5c7a8a1fabdf0bfdf06f0f1ed92e77 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -20,6 +20,9 @@ ruff: image: python:3.10 stage: lint before_script: + # Install libgdal-dev + - apt-get update + - apt-get install -y libgdal-dev # Install pipx - python3 -m pip install --user pipx - python3 -m pipx ensurepath @@ -36,6 +39,9 @@ test: image: python:3.10 stage: test before_script: + # Install libgdal-dev + - apt-get update + - apt-get install -y libgdal-dev # Install pipx - python3 -m pip install --user pipx - python3 -m pipx ensurepath diff --git a/CHANGELOG.md b/CHANGELOG.md index 32cf7cc6cd30617047dbc698a36744c6b3173a25..cda2fb141d079793831f422e52935fe2c2da7a81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,27 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.1.0] - 2025-01-09 + +### Added + +- **Unit Tests**: + - URI replacements and resource clearing functionality. + - Support for multiple formats: Atom, DOCX, GeoTIFF, GeoJSON, JPEG, ODT, ODS, PDF, RDF, TXT, WMTS, XLSX. + - Frictionless Data Resource validation. + +- **Report Generation**: + - Added columns for **HTTP status** and **error message** in the generated reports. + - Implemented **filters** for table columns, allowing users to refine data views. + +### Changed + +- **Coverage Configuration**: + - Updated coverage settings to better manage test file inclusion/exclusion. + - Test files are now excluded from coverage reports to focus on measuring application code quality. + +- **Dockerfile**: Switched base image to `python:3.10` and updated installation steps for dependencies, pipx, and Poetry. + ## [1.0.0] - 2024-12-20 ### Added diff --git a/Dockerfile b/Dockerfile index 895863c5b81f5d3197b1fdfcf281984e4820b780..68a01d46b95abe990db39ed6aecd0f172f189536 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,25 @@ -FROM alpine +FROM python:3.10 # Install necessary system dependencies -RUN apk add --no-cache poetry proj-util gdal-dev gcc python3-dev musl-dev geos-dev proj-dev libmagic +RUN apt-get update && \ + apt-get install -y \ + libgdal-dev \ + libmagic-dev \ + gcc \ + python3-dev \ + musl-dev \ + libgeos-dev \ + libproj-dev \ + && python3 -m pip install --upgrade pip \ + && python3 -m pip install pipx \ + && python3 -m pipx ensurepath -# Set the PATH for pipx +# Ensure pipx is in the PATH ENV PATH="/root/.local/bin:${PATH}" +# Install poetry using pipx +RUN pipx install poetry + # Set the working directory inside the container WORKDIR /app diff --git a/README.md b/README.md index 12d3898b6cc38f35ad59ace2bf9c477abcb21f2f..e891a00ad61d9c21bebce4eab83fdb04fc874b27 100644 --- a/README.md +++ b/README.md @@ -30,16 +30,26 @@ The following format checks are currently being carried out: | Format | Check | | --------- | ------- | -| `GEOJSON` | Load the file using [`GeoPandas`](https://geopandas.org). | -| `GML` | Load the file using [`GeoPandas`](https://geopandas.org). | -| `JPEG` | Load the image. | -| `JSON` | Is it syntactically correct JSON? 
If it is a *Frictionless Data Resource*, it is checked with the Frictionless Tools. |
-| `PNG`     | Load the image. |
-| `PDF`     | Load the document using [`pypdf`](https://pypi.org/project/pypdf/). |
-| `SHP`     | Load the file using [`GeoPandas`](https://geopandas.org). |
-| `WFS`     | Is it a valid well-formed `WMS_Capabilities` XML document? If the address does not contain the `request=GetCapabilities` parameter, a `GetCapabilities` request is performed. This response is then checked. |
-| `WMS`     | Is it a valid well-formed `WFS_Capabilities` XML document? If the address does not contain the `request=GetCapabilities` parameter, a `GetCapabilities` request is performed. This response is then checked. |
-| `XML`     | Is it well-formed XML? |
+| `ATOM`    | Validates whether the file content is a valid Atom feed by confirming the root element is `<feed>` in the Atom XML namespace. |
+| `DOCX`    | Verifies that the file is a valid DOCX by ensuring the ZIP archive contains the necessary XML files (`document.xml` and `styles.xml`). |
+| `GEOJSON` | Loads the file with the [`geojson`](https://pypi.org/project/geojson/) library and verifies that it is a GeoJSON object. |
+| `GEOTIFF` | Verifies the file is a valid GeoTIFF by checking its GeoTransform information; both standalone and ZIP-compressed GeoTIFFs are supported. |
+| `GML`     | Loads and validates the file using [`GeoPandas`](https://geopandas.org). |
+| `JPEG`    | Loads and validates the image file. |
+| `JSON`    | Verifies that the file is syntactically correct JSON and, if it is a *Frictionless Data Resource*, checks it using Frictionless Tools. |
+| `ODS`     | Validates that the file is a valid ODS (OpenDocument Spreadsheet) by checking the ZIP structure, required files, and correct MIME type. |
+| `ODT`     | Validates that the file is a valid ODT (OpenDocument Text) by confirming the ZIP structure, required files, and correct MIME type. |
+| `PARQUET` | Verifies that the file is a readable Apache Parquet file by loading it using [`pandas`](https://pandas.pydata.org/). |
+| `PDF`     | Loads and validates the PDF document using [`pypdf`](https://pypi.org/project/pypdf/). |
+| `PNG`     | Loads and validates the image file. |
+| `RDF`     | Verifies the file is a valid RDF (Resource Description Framework) document and contains more than two statements. |
+| `SHP`     | Loads and validates the file using [`GeoPandas`](https://geopandas.org). |
+| `WFS`     | Validates whether the file is a well-formed `WFS_Capabilities` XML document. If not, a `GetCapabilities` request is made and the response validated. |
+| `WMS`     | Validates whether the file is a well-formed `WMS_Capabilities` XML document. If not, a `GetCapabilities` request is made and the response validated. |
+| `WMTS`    | Validates whether the file contains a valid WMTS (Web Map Tile Service) capabilities XML response, either directly or by performing a `GetCapabilities` request. |
+| `XLSX`    | Verifies that the file is a ZIP archive and contains the required files (`xl/workbook.xml` and `xl/styles.xml`) typical of a valid XLSX file. |
+| `XML`     | Verifies that the file is well-formed XML. |
+| `ZIP`     | Verifies that the file is a valid ZIP archive using Python's `zipfile.is_zipfile()` function. |
 
 ## Installation
@@ -208,8 +218,6 @@ In this example:
 
 2. Define the desired replacements in the JSON array format described above.
 3. Run the script as usual. If the file exists, replacements will be applied automatically.
 
-By using `uri_replacements.json`, you can streamline URL handling and ensure consistent preprocessing for your link-checking tasks.
-
 ## Docker
 
 You can run the script in a Docker container.
See the [Dockerfile](./Dockerfile) for more information. @@ -225,7 +233,7 @@ You can run the script in a Docker container. See the [Dockerfile](./Dockerfile) 2. Run the Docker container: ```sh - docker run --rm dcat-catalog-check --url https://example.com + docker run --rm dcat-catalog-check --url https://example.com ``` ## Tests diff --git a/dcat_catalog_check.py b/dcat_catalog_check.py index 91dde8c09e5edccafbe07009c9e48409432f1bf8..86bd20550a67aa0158c327755b53485599de729a 100755 --- a/dcat_catalog_check.py +++ b/dcat_catalog_check.py @@ -154,7 +154,8 @@ class DcatCatalogCheck: "error", "etag", "http_status", - "last_check" "mimetype", + "last_check", + "mimetype", "mimetype_mismatch", "valid", ]: @@ -174,8 +175,7 @@ class DcatCatalogCheck: format = resource["format"].lower() try: # dynamically import the corresponding module for the format - format_check_module = importlib.import_module( - f"formats.{format}_format") + format_check_module = importlib.import_module(f"formats.{format}_format") except ModuleNotFoundError: format_check_module = None @@ -194,6 +194,9 @@ class DcatCatalogCheck: if "etag" in response.headers: resource["etag"] = response.headers["etag"] + if "content-length" in response.headers: + resource["size"] = response.headers["content-length"] + except requests.exceptions.RequestException as err: # Handle connection, timeout, or other request errors resource["accessible"] = False @@ -210,8 +213,7 @@ class DcatCatalogCheck: # write the content of the HTTP response into a temporary file original_file_name = url.split("/")[-1] - suffix = original_file_name.split( - ".")[-1] if "." in original_file_name else "" + suffix = original_file_name.split(".")[-1] if "." in original_file_name else "" with tempfile.NamedTemporaryFile( delete=False, suffix="." 
+ suffix ) as temp_file: @@ -234,8 +236,7 @@ class DcatCatalogCheck: decompressor = decompressors.get(resource["mimetype"]) if not decompressor: - self.logger.warning( - f"Unknown compression {resource['mimetype']}.") + self.logger.warning(f"Unknown compression {resource['mimetype']}.") else: with tempfile.NamedTemporaryFile(delete=False) as decompressed_file: with decompressor.open(temp_file.name, "rb") as compressed_file: @@ -245,9 +246,10 @@ class DcatCatalogCheck: temp_file = decompressed_file resource["mimetype"] = self._guess_mime_type(temp_file.name) - if self._is_container(resource["mimetype"], resource["format"]): - self._check_container_file( - resource, temp_file, format_check_module) + if self._is_container(resource["mimetype"], resource["format"]) and resource[ + "format" + ] not in ["GTFS", "GEOTIFF", "SHP"]: + self._check_container_file(resource, temp_file, format_check_module) else: self._check_single_file(resource, temp_file, format_check_module) @@ -275,8 +277,7 @@ class DcatCatalogCheck: temp_file.write(file.read()) temp_file.flush() - resource["mimetype"] = self._guess_mime_type( - temp_file.name) + resource["mimetype"] = self._guess_mime_type(temp_file.name) validation_result = ( validation_result and self._check_single_file( @@ -290,14 +291,12 @@ class DcatCatalogCheck: return contains_at_least_one_relevant_file and validation_result else: - self.logger.error( - f"Unsupported container format {resource['mimetype']}") + self.logger.error(f"Unsupported container format {resource['mimetype']}") def _check_single_file(self, resource, temp_file, format_check_module): if format_check_module: # call the function `process` that is defined in every modul - resource["valid"] = format_check_module.is_valid( - resource, temp_file) + resource["valid"] = format_check_module.is_valid(resource, temp_file) else: # There is no specialized check for the specified format. # Does the returned MIME type match the promised format? @@ -322,8 +321,7 @@ class DcatCatalogCheck: ): hash_algorithm = hashlib.md5() else: - print( - f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr) + print(f"WARNING: unknown checksum algorithm {algo_name}", file=sys.stderr) return with open(temp_file.name, "rb") as f: @@ -418,8 +416,7 @@ class DcatCatalogCheck: publisher = graph.value(dataset, DCTERMS.publisher) if not publisher: - self.logger.warning( - f"Publisher not found for dataset: {dataset}") + self.logger.warning(f"Publisher not found for dataset: {dataset}") return None # Attempt to get the publisher's name @@ -433,8 +430,7 @@ class DcatCatalogCheck: except Exception as e: # Log any unexpected errors - self.logger.error( - f"Error retrieving publisher for dataset {dataset}: {e}") + self.logger.error(f"Error retrieving publisher for dataset {dataset}: {e}") return None def _process_datasets(self, datasets, g): @@ -459,8 +455,7 @@ class DcatCatalogCheck: url = str(resource["url"]) if self._needs_check(url): - checksum_resource = g.value( - distribution, SPDX.checksum) + checksum_resource = g.value(distribution, SPDX.checksum) if checksum_resource: resource["checksum_algorithm"] = str( g.value(checksum_resource, SPDX.algorithm) @@ -481,7 +476,8 @@ class DcatCatalogCheck: def read_previous_results(self, file_path): if not os.path.exists(file_path): self.logger.warning( - f"File '{file_path}' does not exist. No previous results loaded.") + f"File '{file_path}' does not exist. No previous results loaded." 
+ ) return loaded_count = 0 @@ -500,7 +496,8 @@ class DcatCatalogCheck: url = json_object.get("url") if not url: self.logger.warning( - f"Line {line_number} is missing 'url': {line}") + f"Line {line_number} is missing 'url': {line}" + ) skipped_count += 1 continue @@ -508,12 +505,12 @@ class DcatCatalogCheck: loaded_count += 1 except json.JSONDecodeError as e: - self.logger.error( - f"Invalid JSON at line {line_number}: {e}") + self.logger.error(f"Invalid JSON at line {line_number}: {e}") skipped_count += 1 self.logger.info( - f"Loaded {loaded_count} results from '{file_path}', skipped {skipped_count} lines.") + f"Loaded {loaded_count} results from '{file_path}', skipped {skipped_count} lines." + ) def read_dcat_catalog(self, url): while url: @@ -536,8 +533,7 @@ class DcatCatalogCheck: self._process_datasets(datasets, g) - paged_collection = g.value( - predicate=RDF.type, object=HYDRA.PagedCollection) + paged_collection = g.value(predicate=RDF.type, object=HYDRA.PagedCollection) next_page = g.value(paged_collection, HYDRA.nextPage) url = str(next_page) if next_page else None @@ -562,12 +558,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--url", help="DCAT catalog URL") parser.add_argument("--log_file", help="Log file path") - parser.add_argument( - "--results", help="File from which the results are loaded") - parser.add_argument("--verbose", action="store_true", - help="Enable verbose logging") - parser.add_argument("--debug", action="store_true", - help="Enable debug logging") + parser.add_argument("--results", help="File from which the results are loaded") + parser.add_argument("--verbose", action="store_true", help="Enable verbose logging") + parser.add_argument("--debug", action="store_true", help="Enable debug logging") parser.add_argument( "--recheck", action="store_true", @@ -578,8 +571,7 @@ if __name__ == "__main__": action="store_true", help="Just check new entries from the catalog. Do not re-check existing results.", ) - parser.add_argument( - "--check-format", help="Only check the specified format") + parser.add_argument("--check-format", help="Only check the specified format") parser.add_argument( "--force-check-format", help="Check distributinons with the specified format regardless of previous results", diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bf1a85025bbf6e630ddd7c203798d71792d520c --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,28 @@ +version: '3.8' +services: + lint: + image: node + command: > + sh -c " + npm install -g markdownlint markdownlint-cli && + markdownlint '**/*.md' --ignore node_modules | tee lint.log + " + volumes: + - .:/app + - /app/node_modules + ruff: + image: python:3.10 + command: > + bash -c " + apt-get update && + apt-get install -y libgdal-dev && + python3 -m pip install --user pipx && + python3 -m pipx ensurepath && + source ~/.bashrc && + pipx install poetry && + poetry install && + poetry run ruff check . 
+      "
+    volumes:
+      - .:/app
+    working_dir: /app
diff --git a/formats/atom_format.py b/formats/atom_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c19d1bfc419ad2fdc457e1eed3f90a5ab5d8110
--- /dev/null
+++ b/formats/atom_format.py
@@ -0,0 +1,20 @@
+import xml.etree.ElementTree as ET
+
+
+def is_valid(resource, file):
+    """Check if the HTTP response is an ATOM feed."""
+
+    with open(file.name, "rb") as f:
+        try:
+            xml = ET.parse(f).getroot()
+
+            if xml.tag == "{http://www.w3.org/2005/Atom}feed":
+                return True
+            else:
+                resource["error"] = (
+                    "Root element is not {http://www.w3.org/2005/Atom}feed"
+                )
+                return False
+        except Exception as e:
+            resource["error"] = str(e)
+            return False
diff --git a/formats/docx_format.py b/formats/docx_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f471a5094c64c0f606bb5b0e1c288f93e5c3790
--- /dev/null
+++ b/formats/docx_format.py
@@ -0,0 +1,20 @@
+import zipfile
+
+
+def is_valid(resource, file):
+    """Check if the content is a DOCX file."""
+
+    if not zipfile.is_zipfile(file.name):
+        resource["error"] = "Not a ZIP file."
+        return False
+
+    with zipfile.ZipFile(file.name, "r") as zip_ref:
+        zip_contents = zip_ref.namelist()
+
+        required_files = ["word/document.xml", "word/styles.xml"]
+
+        if not all(file in zip_contents for file in required_files):
+            resource["error"] = "That does not look like a DOCX file."
+            return False
+
+    return True
diff --git a/formats/geojson_format.py b/formats/geojson_format.py
index 111288f1009e1783279b0238762636849776954a..a4ed2a928e090a42f2c716e39776e85f3396ec61 100644
--- a/formats/geojson_format.py
+++ b/formats/geojson_format.py
@@ -1,6 +1,4 @@
-import geopandas
-from pyogrio.errors import DataSourceError
-from shapely.errors import GEOSException
+import geojson
 
 
 def is_valid(resource, file):
@@ -8,9 +6,11 @@
 
     with open(file.name, "rb") as f:
         try:
-            geopandas.read_file(f)
-            return True
-        except DataSourceError:
-            return False
-        except GEOSException:
-            return False
+            geojson_data = geojson.load(f)
+            if isinstance(geojson_data, dict) and "type" in geojson_data:
+                return True
+            else:
+                resource["error"] = "JSON is not GeoJSON."
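+                # RFC 7946: a GeoJSON object is always a JSON object with a "type" member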
+                return False
+        except Exception as e:
+            resource["error"] = str(e)
+            return False
diff --git a/formats/geotiff_format.py b/formats/geotiff_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..d291e1a13fe318a1b0e2abb4b02438240475a1fa
--- /dev/null
+++ b/formats/geotiff_format.py
@@ -0,0 +1,51 @@
+from osgeo import gdal
+import zipfile
+import tempfile
+import os
+
+
+def is_geotiff(resource, file_name):
+    dataset = gdal.Open(file_name)
+
+    if not dataset:
+        resource["error"] = f"could not read file {file_name}"
+        return False
+
+    geotransform = dataset.GetGeoTransform()
+    default_transform = (0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
+
+    if geotransform == default_transform:
+        resource["error"] = "missing transformation"
+        return False
+
+    return True
+
+
+def is_valid(resource, file):
+    """Check if the content is a GeoTIFF file."""
+
+    # Some GeoTIFF files consist of two files in a ZIP file:
+    # - the TIFF image itself
+    # - a TFW world file with the transform information
+    if zipfile.is_zipfile(file.name):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            with zipfile.ZipFile(file.name, "r") as zip_ref:
+                file_list = zip_ref.namelist()
+                relevant_files = [
+                    file
+                    for file in file_list
+                    if file.lower().endswith(".tiff") or file.lower().endswith(".tif")
+                ]
+                contains_at_least_one_relevant_file = len(relevant_files) > 0
+                if contains_at_least_one_relevant_file:
+                    zip_ref.extractall(temp_dir)
+                    for tif_name in relevant_files:
+                        tif_path = os.path.join(temp_dir, tif_name)
+                        if is_geotiff(resource, tif_path):
+                            # the ZIP file contains at least one valid GeoTIFF
+                            return True
+                    # none of the TIFF images in the archive is a valid GeoTIFF
+                    return False
+                else:
+                    resource["error"] = "ZIP file contains no TIFF image"
+                    return False
+    else:
+        return is_geotiff(resource, file.name)
diff --git a/formats/gml_format.py b/formats/gml_format.py
index c74e401c580e933ed45c4e10cf33a0c22666fde7..b0dc4f9b32b096ca196bda5a82ba96d7bec937e5 100644
--- a/formats/gml_format.py
+++ b/formats/gml_format.py
@@ -1,6 +1,4 @@
 import geopandas
-from pyogrio.errors import DataSourceError
-from shapely.errors import GEOSException
 
 
 def is_valid(resource, file):
@@ -10,12 +8,6 @@
         try:
             geopandas.read_file(f)
             return True
-        except DataSourceError as e:
-            resource["error"] = str(e)
-            return False
-        except GEOSException as e:
-            resource["error"] = str(e)
-            return False
         except Exception as e:
             resource["error"] = str(e)
             return False
diff --git a/formats/json_format.py b/formats/json_format.py
index 5cf52fbfca54ba6ecbaea99c4f06740694316759..41f4e658ba3c2005dad2fd65f7682acaf1216c9a 100644
--- a/formats/json_format.py
+++ b/formats/json_format.py
@@ -23,9 +23,6 @@ def is_valid(resource, file):
             return resource["schema_valid"]
 
         return True
-    except json.JSONDecodeError as e:
-        resource["error"] = str(e)
-        return False
-    except UnicodeDecodeError as e:
+    except Exception as e:
         resource["error"] = str(e)
         return False
diff --git a/formats/ods_format.py b/formats/ods_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ff033b3bded0051c0f69c65ff34028e5e7f4374
--- /dev/null
+++ b/formats/ods_format.py
@@ -0,0 +1,27 @@
+import zipfile
+
+
+def is_valid(resource, file):
+    """Check if the content is an ODS file."""
+
+    if not zipfile.is_zipfile(file.name):
+        resource["error"] = "Not a ZIP file."
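+        # OpenDocument files are ZIP packages, so a non-ZIP payload can be rejected immediately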
+        return False
+
+    with zipfile.ZipFile(file.name, "r") as zip_ref:
+        zip_contents = zip_ref.namelist()
+
+        required_files = ["mimetype", "content.xml", "meta.xml", "styles.xml"]
+
+        if not all(file in zip_contents for file in required_files):
+            resource["error"] = "That does not look like an ODS file."
+            return False
+
+        with zip_ref.open("mimetype") as mimetype_file:
+            mimetype_content = mimetype_file.read().decode("utf-8").strip()
+
+        if mimetype_content != "application/vnd.oasis.opendocument.spreadsheet":
+            resource["error"] = f"Incorrect MIME type: {mimetype_content}"
+            return False
+
+    return True
diff --git a/formats/odt_format.py b/formats/odt_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cd30402796448db81e94a8f5385237434f6a44f
--- /dev/null
+++ b/formats/odt_format.py
@@ -0,0 +1,27 @@
+import zipfile
+
+
+def is_valid(resource, file):
+    """Check if the content is an ODT file."""
+
+    if not zipfile.is_zipfile(file.name):
+        resource["error"] = "Not a ZIP file."
+        return False
+
+    with zipfile.ZipFile(file.name, "r") as zip_ref:
+        zip_contents = zip_ref.namelist()
+
+        required_files = ["mimetype", "content.xml", "meta.xml", "styles.xml"]
+
+        if not all(file in zip_contents for file in required_files):
+            resource["error"] = "That does not look like an ODT file."
+            return False
+
+        with zip_ref.open("mimetype") as mimetype_file:
+            mimetype_content = mimetype_file.read().decode("utf-8").strip()
+
+        if mimetype_content != "application/vnd.oasis.opendocument.text":
+            resource["error"] = f"Incorrect MIME type: {mimetype_content}"
+            return False
+
+    return True
diff --git a/formats/pdf_format.py b/formats/pdf_format.py
index 4a7ee6944ab8a44061219da9a6e5cd40bf7a3717..2c7e9339bfd9c65ff63357df3ec256c752f0398e 100644
--- a/formats/pdf_format.py
+++ b/formats/pdf_format.py
@@ -1,5 +1,4 @@
 from pypdf import PdfReader
-from pypdf.errors import PyPdfError
 
 
 def is_valid(resource, file):
@@ -9,5 +8,6 @@
         try:
             PdfReader(f)
             return True
-        except PyPdfError:
+        except Exception as e:
+            resource["error"] = str(e)
             return False
diff --git a/formats/png_format.py b/formats/png_format.py
index ec3a7344edea208acc28303723163a663f7ae951..c7a9efb9890e01df4494e43ebd4262bac54fa3b9 100644
--- a/formats/png_format.py
+++ b/formats/png_format.py
@@ -1,4 +1,4 @@
-from PIL import Image, UnidentifiedImageError
+from PIL import Image
 
 
 def is_valid(resource, file):
@@ -7,5 +7,6 @@
     try:
         with Image.open(file.name, formats=["PNG"]):
             return True
-    except UnidentifiedImageError:
+    except Exception as e:
+        resource["error"] = str(e)
         return False
diff --git a/formats/rdf_format.py b/formats/rdf_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..27de8ee1969df75ac51f08b1b425a0c7a9d82b1f
--- /dev/null
+++ b/formats/rdf_format.py
@@ -0,0 +1,19 @@
+from rdflib import Graph
+
+
+def is_valid(resource, file):
+    """Check if file is a valid RDF document."""
+
+    try:
+        graph = Graph()
+        graph.parse(file.name)
+
+        # even an empty RDF document contains two statements
+        if len(graph) > 2:
+            return True
+        else:
+            resource["error"] = "RDF document contains too few statements."
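+            # the document parsed, but carries no statements beyond the two-statement baseline described above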
+ return False + except Exception as e: + resource["error"] = str(e) + return False diff --git a/formats/shp_format.py b/formats/shp_format.py index de42333685da6a31c2bbedfd2336d3f61dea69c1..a13329913d7dc028e54d072233520b2b37cde31a 100644 --- a/formats/shp_format.py +++ b/formats/shp_format.py @@ -1,6 +1,4 @@ import geopandas -from pyogrio.errors import DataSourceError -from shapely.errors import GEOSException import zipfile @@ -24,10 +22,7 @@ def is_valid(resource, file): with open(file.name, "rb") as f: try: geopandas.read_file(f) - except DataSourceError as e: - resource["error"] = str(e) - return False - except GEOSException as e: + except Exception as e: resource["error"] = str(e) return False return True @@ -37,10 +32,7 @@ def is_valid(resource, file): with z.open(shp) as f: try: geopandas.read_file(f"zip://{file.name}!{shp}") - except DataSourceError as e: - resource["error"] = str(e) - return False - except GEOSException as e: + except Exception as e: resource["error"] = str(e) return False return True diff --git a/formats/wfs_srvc_format.py b/formats/wfs_srvc_format.py index bdf788ef03c4c357666dae0d3a54f2d41af60518..9ded4c2de06fece7f5e124ce7abcb4792c206adf 100644 --- a/formats/wfs_srvc_format.py +++ b/formats/wfs_srvc_format.py @@ -12,21 +12,26 @@ def _load_into_file(url): return temp_file -def _is_capabilites_response(file): +def _is_capabilites_response(resource, file): with open(file.name, "rb") as f: try: xml = ET.parse(f).getroot() - return ( + if ( xml.tag == "{http://www.opengis.net/wfs/2.0}WFS_Capabilities" or xml.tag == "{http://www.opengis.net/wfs}WFS_Capabilities" - ) - except ET.ParseError: + ): + return True + else: + resource["error"] = "Root element is not WFS_Capabilities" + return False + except Exception as e: + resource["error"] = str(e) return False def is_valid(resource, file): - if _is_capabilites_response(file): + if _is_capabilites_response(resource, file): return True # The response is not a capabilites XML files. That is allowed. @@ -38,7 +43,12 @@ def is_valid(resource, file): url = url + "?" url = url + "service=WFS&request=GetCapabilities" - return _is_capabilites_response(_load_into_file(url)) + + try: + return _is_capabilites_response(resource, _load_into_file(url)) + except Exception as e: + resource["error"] = str(e) + return False else: # The URL already contains a getCapabilites request but the result was not a correct answer. return False diff --git a/formats/wms_srvc_format.py b/formats/wms_srvc_format.py index 7221d623ea97579e6fb2fb79660eea9a4a9d93c3..13839c8635c4a0fa07008ce1ea217b2096fc805a 100644 --- a/formats/wms_srvc_format.py +++ b/formats/wms_srvc_format.py @@ -12,18 +12,25 @@ def _load_into_file(url): return temp_file -def _is_capabilites_response(file): +def _is_capabilites_response(resource, file): with open(file.name, "rb") as f: try: xml = ET.parse(f).getroot() - return xml.tag == "{http://www.opengis.net/wms}WMS_Capabilities" - except ET.ParseError: + if xml.tag == "{http://www.opengis.net/wms}WMS_Capabilities": + return True + else: + resource["error"] = ( + "Root element is not {http://www.opengis.net/wms}WMS_Capabilities" + ) + return False + except Exception as e: + resource["error"] = str(e) return False def is_valid(resource, file): - if _is_capabilites_response(file): + if _is_capabilites_response(resource, file): return True # The response is not a capabilites XML files. That is allowed. @@ -35,7 +42,12 @@ def is_valid(resource, file): url = url + "?" 
         url = url + "service=WMS&request=GetCapabilities"
-        return _is_capabilites_response(_load_into_file(url))
+        try:
+            return _is_capabilites_response(resource, _load_into_file(url))
+        except Exception as e:
+            resource["error"] = str(e)
+            return False
+
     else:
         # The URL already contains a getCapabilites request but the result was not a correct answer.
         return False
diff --git a/formats/wmts_srvc_format.py b/formats/wmts_srvc_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..a27c0936c0fe7c1251eafbc367cbac0ea3d0a2aa
--- /dev/null
+++ b/formats/wmts_srvc_format.py
@@ -0,0 +1,53 @@
+import xml.etree.ElementTree as ET
+import requests
+import tempfile
+
+
+def _load_into_file(url):
+    response = requests.get(url)
+    response.raise_for_status()
+
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        temp_file.write(response.content)
+    return temp_file
+
+
+def _is_capabilites_response(resource, file):
+    with open(file.name, "rb") as f:
+        try:
+            xml = ET.parse(f).getroot()
+
+            if xml.tag == "{http://www.opengis.net/wmts/1.0}Capabilities":
+                return True
+            else:
+                resource["error"] = (
+                    "Root element is not {http://www.opengis.net/wmts/1.0}Capabilities"
+                )
+                return False
+        except Exception as e:
+            resource["error"] = str(e)
+            return False
+
+
+def is_valid(resource, file):
+    if _is_capabilites_response(resource, file):
+        return True
+
+    # The response is not a capabilities XML file. That is allowed.
+    # Let's add the request parameters to the URL and try again.
+
+    url = resource["url"]
+    if "request=" not in url.lower():
+        if not url.endswith("?"):
+            url = url + "?"
+
+        url = url + "service=WMTS&request=GetCapabilities"
+        try:
+            return _is_capabilites_response(resource, _load_into_file(url))
+        except Exception as e:
+            resource["error"] = str(e)
+            return False
+
+    else:
+        # The URL already contains a GetCapabilities request but the result was not a correct answer.
+        return False
diff --git a/formats/xlsx_format.py b/formats/xlsx_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..0799403ab350775754e135057de3c779290018d3
--- /dev/null
+++ b/formats/xlsx_format.py
@@ -0,0 +1,20 @@
+import zipfile
+
+
+def is_valid(resource, file):
+    """Check if the content is an XLSX file."""
+
+    if not zipfile.is_zipfile(file.name):
+        resource["error"] = "Not a ZIP file."
+        return False
+
+    with zipfile.ZipFile(file.name, "r") as zip_ref:
+        zip_contents = zip_ref.namelist()
+
+        required_files = ["xl/workbook.xml", "xl/styles.xml"]
+
+        if not all(file in zip_contents for file in required_files):
+            resource["error"] = "That does not look like an XLSX file."
+            return False
+
+    return True
diff --git a/formats/zip_format.py b/formats/zip_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..8de8b6a8fd3b61e9b7360ec2f6cb088512af44a0
--- /dev/null
+++ b/formats/zip_format.py
@@ -0,0 +1,11 @@
+import zipfile
+
+
+def is_valid(resource, file):
+    """Check if the file is a ZIP file."""
+
+    if not zipfile.is_zipfile(file.name):
+        resource["error"] = "Not a ZIP file."
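+        # zipfile.is_zipfile() only checks the ZIP magic number / end-of-central-directory record, not the member files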
+ return False + + return True diff --git a/poetry.lock b/poetry.lock index edac2a2f7c95425967b0d7cdbe7995dda81e460f..830f4af45ded09cbeb5e9fe67d1d89c7d8e22331 100644 --- a/poetry.lock +++ b/poetry.lock @@ -402,6 +402,30 @@ visidata = ["visidata (>=2.10)"] wkt = ["tatsu (>=5.8.3)"] zenodo = ["pyzenodo3 (>=1.0)"] +[[package]] +name = "gdal" +version = "3.6.2" +description = "GDAL: Geospatial Data Abstraction Library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "GDAL-3.6.2.tar.gz", hash = "sha256:a167cde1813707d91a938dad1a22f280f5ad28c45980d42e948fb8c59f890f5a"}, +] + +[package.extras] +numpy = ["numpy (>1.0.0)"] + +[[package]] +name = "geojson" +version = "3.2.0" +description = "Python bindings and utilities for GeoJSON" +optional = false +python-versions = ">=3.7" +files = [ + {file = "geojson-3.2.0-py3-none-any.whl", hash = "sha256:69d14156469e13c79479672eafae7b37e2dcd19bdfd77b53f74fa8fe29910b52"}, + {file = "geojson-3.2.0.tar.gz", hash = "sha256:b860baba1e8c6f71f8f5f6e3949a694daccf40820fa8f138b3f712bd85804903"}, +] + [[package]] name = "geopandas" version = "1.0.1" @@ -1784,4 +1808,4 @@ crypto-eth-addresses = ["eth-hash[pycryptodome] (>=0.7.0)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "5623eece30a81343a4c45540ab21121ad4059f42e96aea0941eb630dc659e791" +content-hash = "28cf723fd433a2a9f0c2d0c5dfd8bbfb6876d8de8b9e7faf93d0e0aeda5c458b" diff --git a/publiccode.yml b/publiccode.yml index 3ccdcbc0f4cd46ffbe4a272f0859dd7c5c494457..f03ca2db48183c2d66ccd4c1ad6756dab4cf0edb 100644 --- a/publiccode.yml +++ b/publiccode.yml @@ -53,7 +53,7 @@ platforms: - windows releaseDate: '2024-12-19' softwareType: standalone/other -softwareVersion: 1.0.0 +softwareVersion: 1.1.0 url: 'https://code.schleswig-holstein.de/opendata/dcat-catalog-check' usedBy: - Open-Data-Portal Schleswig-Holstein diff --git a/pyproject.toml b/pyproject.toml index a17f94b91c4a074bf571a8453351983fe57e0411..e37090ade364fb864c235c0fe0800a28e391527d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "dcat-catalog-check" -version = "1.0.0" +version = "1.1.0" description = "DCAT Catalog Check" authors = [ "Jesper Zedlitz <jesper.zedlitz@stk.landsh.de>", @@ -21,6 +21,8 @@ pypdf = "^5.1.0" pillow = "^11.0.0" fiona = "^1.10.1" pyarrow = "^18.1.0" +geojson = "^3.2.0" +gdal = "3.6.2" [tool.poetry.group.dev.dependencies] coverage = "^7.6.1" diff --git a/tests/data/Atom_SH_Feldblockfinder_OpenGBD.xml b/tests/data/Atom_SH_Feldblockfinder_OpenGBD.xml new file mode 100644 index 0000000000000000000000000000000000000000..68423ea720a4fc6c02cf2e3f538988171ee01a82 --- /dev/null +++ b/tests/data/Atom_SH_Feldblockfinder_OpenGBD.xml @@ -0,0 +1,62 @@ +<?xml version="1.0" encoding="UTF-8"?> +<feed xmlns:georss="http://www.georss.org/georss" xmlns:inspire_dls="http://inspire.ec.europa.eu/schemas/inspire_dls/1.0" xmlns:lang="ger" xmlns="http://www.w3.org/2005/Atom"> + <title>ATOMFeed SH Feldblockfinder Schleswig-Holstein (Downloaddienst)</title> + <subtitle>Downloaddienst zur Bereitstellung der Feldblöcke, Landschaftselemente, GLÖZ 2- und Nitratkulisse in Schleswig-Holstein.</subtitle> + <link href="http://sh-mis.schleswig-holstein.de/soapServices/CSWStartup?service=CSW&version=2.0.2&request=GetRecordById&outputschema=http://www.isotc211.org/2005/gmd&elementsetname=full&ID=cc68aa82-d71b-42bb-b5ce-7b850486a842" rel="describedby" type="application/xml"/> + <link 
href="https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD.xml" rel="self" type="application/atom+xml" hreflang="ger" title="Selbstreferenz"/> + <link href="https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD_OSD.xml" rel="search" type="application/opensearchdescription+xml" title="Open Search Beschreibung ATOMFeed SH Feldblockfinder Schleswig-Holstein (Downloaddienst)"/> + <id>https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD.xml</id> + <updated>2023-08-22T12:00:00+01:00</updated> + <author> + <name>Landesamt für Vermessung und Geoinformation Schleswig-Holstein (Servicestelle DANord)</name> + <email>DANord@LVermGeo.landsh.de</email> + </author> + <entry> + <title>Feldblöcke SH 2024</title> + <inspire_dls:spatial_dataset_identifier_code>21f67269-780f-4f3c-8f66-03dde27acfe7</inspire_dls:spatial_dataset_identifier_code> + <inspire_dls:spatial_dataset_identifier_namespace>http://registry.gdi.de.org/id/de.sh/</inspire_dls:spatial_dataset_identifier_namespace> + <link rel="describedby" href="http://sh-mis.schleswig-holstein.de/soapServices/CSWStartup?service=CSW&version=2.0.2&request=GetRecordById&outputschema=http://www.isotc211.org/2005/gmd&elementsetname=full&ID=21f67269-780f-4f3c-8f66-03dde27acfe7" type="application/xml"/> + <link rel="alternate" href="https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD_21f67269-780f-4f3c-8f66-03dde27acfe7.xml" type="application/atom+xml" hreflang="ger" title="Feldblöcke SH 2024"/> + <id>https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD_21f67269-780f-4f3c-8f66-03dde27acfe7.xml</id> + <updated>2023-08-22T12:00:00+01:00</updated> + <summary>Feldblöcke in Schleswig-Holstein für den Stichtag 01.01.2024.</summary> + <georss:polygon>53.35 7.86 55.05 7.86 55.05 11.35 53.35 11.35 53.35 7.86</georss:polygon> + <category term="http://www.opengis.net/def/crs/EPSG/0/4258" label="ETRS89"/> + </entry> + <entry> + <title>Landschaftselemente SH 2024</title> + <inspire_dls:spatial_dataset_identifier_code>c4ddc5b7-036c-4670-b5ed-445d1aa20a2a</inspire_dls:spatial_dataset_identifier_code> + <inspire_dls:spatial_dataset_identifier_namespace>http://registry.gdi.de.org/id/de.sh/</inspire_dls:spatial_dataset_identifier_namespace> + <link rel="describedby" href="http://sh-mis.schleswig-holstein.de/soapServices/CSWStartup?service=CSW&version=2.0.2&request=GetRecordById&outputschema=http://www.isotc211.org/2005/gmd&elementsetname=full&ID=c4ddc5b7-036c-4670-b5ed-445d1aa20a2a" type="application/xml"/> + <link rel="alternate" href="https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD_c4ddc5b7-036c-4670-b5ed-445d1aa20a2a.xml" type="application/atom+xml" hreflang="ger" title="Landschaftselemente SH 2024"/> + <id>https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD_c4ddc5b7-036c-4670-b5ed-445d1aa20a2a.xml</id> + <updated>2023-08-22T12:00:00+01:00</updated> + <summary>Landschaftselemente in Schleswig Holstein zum Stichtag 01.01.2024.</summary> + <georss:polygon>53.35 7.86 55.05 7.86 55.05 11.35 53.35 11.35 53.35 7.86</georss:polygon> + <category term="http://www.opengis.net/def/crs/EPSG/0/4258" label="ETRS89"/> + </entry> + <entry> + <title>Feuchtgebiete und Moore ab 2 ha (GLÖZ 2) SH</title> + 
<inspire_dls:spatial_dataset_identifier_code>176bcfa1-7af2-4bd5-b4e3-2a6212b8fcd2</inspire_dls:spatial_dataset_identifier_code> + <inspire_dls:spatial_dataset_identifier_namespace>http://registry.gdi.de.org/id/de.sh/</inspire_dls:spatial_dataset_identifier_namespace> + <link rel="describedby" href="http://sh-mis.schleswig-holstein.de/soapServices/CSWStartup?service=CSW&version=2.0.2&request=GetRecordById&outputschema=http://www.isotc211.org/2005/gmd&elementsetname=full&ID=176bcfa1-7af2-4bd5-b4e3-2a6212b8fcd2" type="application/xml"/> + <link rel="alternate" href="https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD_176bcfa1-7af2-4bd5-b4e3-2a6212b8fcd2.xml" type="application/atom+xml" hreflang="ger" title="Feuchtgebiete und Moore ab 2 ha (GLÖZ 2) SH"/> + <id>https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD_176bcfa1-7af2-4bd5-b4e3-2a6212b8fcd2.xml</id> + <updated>2023-08-18T12:00:00+01:00</updated> + <summary>Feuchtgebiete und Moore ab 2 ha (GLÖZ 2) in Schleswig-Holstein.</summary> + <georss:polygon>11.35 7.86 55.05 7.86 55.05 53.35 11.35 53.35 11.35 7.86</georss:polygon> + <category term="http://www.opengis.net/def/crs/EPSG/0/4258" label="ETRS89"/> + </entry> + <entry> + <title>Nitrat-belastete Gebiete nach LDüV in SH</title> + <inspire_dls:spatial_dataset_identifier_code>694bd2d9-3bea-4818-961c-25024acc0588</inspire_dls:spatial_dataset_identifier_code> + <inspire_dls:spatial_dataset_identifier_namespace>http://registry.gdi.de.org/id/de.sh/</inspire_dls:spatial_dataset_identifier_namespace> + <link rel="describedby" href="http://sh-mis.schleswig-holstein.de/soapServices/CSWStartup?service=CSW&version=2.0.2&request=GetRecordById&outputschema=http://www.isotc211.org/2005/gmd&elementsetname=full&ID=694bd2d9-3bea-4818-961c-25024acc0588" type="application/xml"/> + <link rel="alternate" href="https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD_694bd2d9-3bea-4818-961c-25024acc0588.xml" type="application/atom+xml" hreflang="ger" title="Nitrat-belastete Gebiete nach LDüV in SH"/> + <id>https://service.gdi-sh.de/SH_OpenGBD/feeds/Atom_SH_Feldblockfinder_OpenGBD/Atom_SH_Feldblockfinder_OpenGBD_694bd2d9-3bea-4818-961c-25024acc0588.xml</id> + <updated>2023-08-18T12:00:00+01:00</updated> + <summary>Nitrat-belastete Gebiete für das Jahr 2022 nach LDüV in SH.</summary> + <georss:polygon>11.35 7.86 55.05 7.86 55.05 53.35 11.35 53.35 11.35 7.86</georss:polygon> + <category term="http://www.opengis.net/def/crs/EPSG/0/4258" label="ETRS89"/> + </entry> +</feed> diff --git a/tests/data/WMTSCapabilities.xml b/tests/data/WMTSCapabilities.xml new file mode 100644 index 0000000000000000000000000000000000000000..a9d19dfdc9692770e5cd0430ae73670e251e631f --- /dev/null +++ b/tests/data/WMTSCapabilities.xml @@ -0,0 +1,96 @@ +<?xml version="1.0"?> +<Capabilities xmlns="http://www.opengis.net/wmts/1.0" xmlns:ows="http://www.opengis.net/ows/1.1" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:gml="http://www.opengis.net/gml" xsi:schemaLocation="http://www.opengis.net/wmts/1.0 http://schemas.opengis.net/wmts/1.0/wmtsGetCapabilities_response.xsd" version="1.0.0"> + <ows:ServiceIdentification> + <ows:Title>WMTS_SH_ALKIS</ows:Title> + <ows:Abstract>Flächendeckende Beschreibung der Angaben zu den Layern "Flurstücke", "Gebäude" sowie zu den Gruppierungen "Tatsächliche Nutzung" und "Gesetzliche Festlegungen" gemäß 
der entsprechenden Objektbereiche im ALKIS-Objektartenkatalog. Die Gruppierung "Weiteres" ist optional und enthält die Objektbereiche "Bauwerke und Einrichtungen" sowie "Relief". Alle ALKIS-Objekte des Grunddatenbestandes (ausser Grenzpunkte und Netzpunkte) sind Pflichtinhalte. Alle weiteren ALKIS-Objekte können optional geführt werden. Die Präsentation der ALKIS-Daten erfolgt grundsätzlich nach dem ALKIS-Signaturenkatalog für AdV-Standardausgaben. Soweit im Signaturenkatalog festgelegt, stehen für alle Layer Darstellungen in Farbe zur Verfügung. Für "Flurstücke" und "Gebäude" werden zusätzlich Darstellungen in Grausstufen (entsprechend Signaturenkatalog) und in Gelb (keine Flächendarstellung, nur Konturen) angeboten.</ows:Abstract> + <ows:Keywords> + <ows:Keyword>WMS</ows:Keyword> + <ows:Keyword>Landesamt für Vermessung ung Geoinformation Schleswig-Holstein</ows:Keyword> + <ows:Keyword>LVermGeo SH</ows:Keyword> + <ows:Keyword>AdV</ows:Keyword> + <ows:Keyword>ALKIS</ows:Keyword> + <ows:Keyword>opendata</ows:Keyword> + </ows:Keywords> + <ows:ServiceType>OGC WMTS</ows:ServiceType> + <ows:ServiceTypeVersion>1.0.0</ows:ServiceTypeVersion> + <ows:Fees>Für die Nutzung der Daten ist die Creative Commons (CC BY 4.0) – Namensnennung 4.0 International anzuwenden. Die Lizenz ist über http://creativecommons.org/licenses/by/4.0 abrufbar. Der Quellenvermerk lautet "© GeoBasis-DE/LVermGeo SH/CC BY 4.0" ||{"id":"cc-by/4.0","name":"Creative Commons Namensnennung – 4.0 International (CC BY 4.0)","url":"http://creativecommons.org/licenses/by/4.0/","quelle":"© GeoBasis-DE/LVermGeo SH/CC BY 4.0"}</ows:Fees> + <ows:AccessConstraints>NONE</ows:AccessConstraints> + </ows:ServiceIdentification> + <ows:ServiceProvider> + <ows:ProviderName>Landesamt für Vermessung und Geoinformation Schleswig-Holstein (LVermGeo SH)</ows:ProviderName> + <ows:ProviderSite xlink:href="http://www.schleswig-holstein.de/DE/Landesregierung/LVERMGEOSH/lvermgeosh_node.html"/> + <ows:ServiceContact> + <ows:IndividualName>Servicestelle Geoserver</ows:IndividualName> + <ows:PositionName></ows:PositionName> + <ows:ContactInfo> + <ows:Phone> + <ows:Voice>+49 (0)431 383-2019</ows:Voice> + <ows:Facsimile>+49 (0)431 988624-2019</ows:Facsimile> + </ows:Phone> + <ows:Address> + <ows:DeliveryPoint>Landesamt für Vermessung und Geoinformation Schleswig-Holstein (LVermGeo SH)</ows:DeliveryPoint> + <ows:City>Kiel</ows:City> + <ows:PostalCode>24106</ows:PostalCode> + <ows:Country>Germany</ows:Country> + <ows:ElectronicMailAddress>Geoserver@LVermGeo.landsh.de</ows:ElectronicMailAddress> + </ows:Address> + </ows:ContactInfo> + </ows:ServiceContact> + </ows:ServiceProvider> + <Contents> + <Layer> + <ows:Title>SH_ALKIS</ows:Title> + <ows:Abstract></ows:Abstract> + <ows:WGS84BoundingBox> + <ows:LowerCorner>0.10594674240568917 45.237542736025574</ows:LowerCorner> + <ows:UpperCorner>20.448891294525673 56.84787345153812</ows:UpperCorner> + </ows:WGS84BoundingBox> + <ows:Identifier>SH_ALKIS</ows:Identifier> + <Style> + <ows:Identifier>default</ows:Identifier> + <LegendURL + format="image/png" + xlink:href="https://dienste.gdi-sh.de//WMTS_SH_ALKIS_OpenGBD/service?service=WMS&request=GetLegendGraphic&version=1.3.0&format=image%2Fpng&layer=SH_ALKIS" + /> + </Style> + <Format>image/png</Format> + <TileMatrixSetLink> + <TileMatrixSet>DE_EPSG_25832_ADV</TileMatrixSet> + </TileMatrixSetLink> + <ResourceURL format="image/png" resourceType="tile" + template="https://dienste.gdi-sh.de//WMTS_SH_ALKIS_OpenGBD/wmts/SH_ALKIS/{TileMatrixSet}/{TileMatrix}/{TileCol}/{TileRow}.png"/> 
+ </Layer> + <TileMatrixSet> + <ows:Identifier>DE_EPSG_25832_ADV</ows:Identifier> + <ows:SupportedCRS>EPSG:25832</ows:SupportedCRS> + <TileMatrix> + <ows:Identifier>00</ows:Identifier> + <ScaleDenominator>4265.4591676995715</ScaleDenominator> + <TopLeftCorner>-46133.17 6301219.54</TopLeftCorner> + <TileWidth>256</TileWidth> + <TileHeight>256</TileHeight> + <MatrixWidth>4096</MatrixWidth> + <MatrixHeight>4096</MatrixHeight> + </TileMatrix> + <TileMatrix> + <ows:Identifier>01</ows:Identifier> + <ScaleDenominator>2132.729583849782</ScaleDenominator> + <TopLeftCorner>-46133.17 6301219.54</TopLeftCorner> + <TileWidth>256</TileWidth> + <TileHeight>256</TileHeight> + <MatrixWidth>8192</MatrixWidth> + <MatrixHeight>8192</MatrixHeight> + </TileMatrix> + <TileMatrix> + <ows:Identifier>02</ows:Identifier> + <ScaleDenominator>1066.3647919248929</ScaleDenominator> + <TopLeftCorner>-46133.17 6301219.54</TopLeftCorner> + <TileWidth>256</TileWidth> + <TileHeight>256</TileHeight> + <MatrixWidth>16384</MatrixWidth> + <MatrixHeight>16384</MatrixHeight> + </TileMatrix> + </TileMatrixSet> + </Contents> + <ServiceMetadataURL xlink:href="https://dienste.gdi-sh.de//WMTS_SH_ALKIS_OpenGBD/wmts/1.0.0/WMTSCapabilities.xml"/> +</Capabilities> \ No newline at end of file diff --git a/tests/data/all-tests.ttl b/tests/data/all-tests.ttl new file mode 100644 index 0000000000000000000000000000000000000000..9a9ae7324579c305f3a79bf08d82b98c852c41c6 --- /dev/null +++ b/tests/data/all-tests.ttl @@ -0,0 +1,109 @@ +@prefix dcat: <http://www.w3.org/ns/dcat#> . +@prefix dcterms: <http://purl.org/dc/terms/> . +@prefix foaf: <http://xmlns.com/foaf/0.1/> . + +<https://example.org/dataset/c0b08e27-c2a1-4602-83fc-961f0f50190c> a dcat:Dataset ; + dcterms:identifier "c0b08e27-c2a1-4602-83fc-961f0f50190c" ; + dcterms:title "all test files" ; + dcterms:publisher [ + a foaf:Organization; + foaf:name "Test" ; + ] ; + dcat:distribution [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/ATOM> ; + dcat:downloadURL <http://localhost:8000/tests/data/Atom_SH_Feldblockfinder_OpenGBD.xml> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/TXT> ; + dcat:downloadURL <http://localhost:8000/tests/data/text.txt> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/CSV> ; + dcat:downloadURL <http://localhost:8000/tests/data/ufo.csv> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/GEOJSON> ; + dcat:downloadURL <http://localhost:8000/tests/data/bermuda.geojson> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/JSON> ; + dcat:downloadURL <http://localhost:8000/tests/data/ufo-resource.json> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/JSON> ; + dcat:downloadURL <http://localhost:8000/tests/data/ufo-schema.json> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/RDF> ; + dcat:downloadURL <http://localhost:8000/tests/data/ufo.ttl> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/DOCX> ; + dcat:downloadURL <http://localhost:8000/tests/data/valid.docx> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/GEOTIFF> ; + dcat:downloadURL 
<http://localhost:8000/tests/data/valid_geotiff.tif> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/ODS> ; + dcat:downloadURL <http://localhost:8000/tests/data/valid.ods> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/ODT> ; + dcat:downloadURL <http://localhost:8000/tests/data/valid.odt> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/XML> ; + dcat:downloadURL <http://localhost:8000/tests/data/correct.xml> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/PARQUET> ; + dcat:downloadURL <http://localhost:8000/tests/data/valid.parquet> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/TIFF> ; + dcat:downloadURL <http://localhost:8000/tests/data/valid.tif> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/XLSX> ; + dcat:downloadURL <http://localhost:8000/tests/data/valid.xlsx> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/GML> ; + dcat:downloadURL <http://localhost:8000/tests/data/bermuda.gml> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/WMTS_SRVC> ; + dcat:downloadURL <http://localhost:8000/tests/data/WMTSCapabilities.xml> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/SHP> ; + dcat:downloadURL <http://localhost:8000/tests/data/zos116.zip> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/SHP> ; + dcat:downloadURL <http://localhost:8000/tests/data/bermuda-with-subdir.zip> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/SHP> ; + dcat:downloadURL <http://localhost:8000/tests/data/bermuda.zip> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/JSON> ; + dcat:downloadURL <http://localhost:8000/tests/data/correct.json> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/JSON> ; + dcat:downloadURL <http://localhost:8000/tests/data/correct.json.bz2> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/GEOTIFF> ; + dcat:downloadURL <http://localhost:8000/tests/data/geotiff.zip> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/JSON> ; + dcat:compressFormat <http://www.iana.org/assignments/media-types/application/gzip> ; + dcat:downloadURL <http://localhost:8000/tests/data/correct.json.gz> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/JSON> ; + dcat:downloadURL <http://localhost:8000/tests/data/correct.json.xz> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/PNG> ; + dcat:downloadURL <http://localhost:8000/tests/data/image.png> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/JSON> ; + dcat:downloadURL <http://localhost:8000/tests/data/json-in-zip.zip> ; + dcat:packageFormat <http://publications.europa.eu/resource/authority/file-type/ZIP> ], + [ a dcat:Distribution ; + dcterms:format 
<http://publications.europa.eu/resource/authority/file-type/JSON> ; + dcat:downloadURL <http://localhost:8000/tests/data/jsons-in-zip.zip> ; + dcat:packageFormat <http://publications.europa.eu/resource/authority/file-type/ZIP> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/PNG> ; + dcat:downloadURL <http://localhost:8000/tests/data/png-in-zip.zip> ; + dcat:packageFormat <http://publications.europa.eu/resource/authority/file-type/ZIP> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/RDF> ; + dcat:downloadURL <http://localhost:8000/tests/data/rdf.json> ], + [ a dcat:Distribution ; + dcterms:format <http://publications.europa.eu/resource/authority/file-type/RDF> ; + dcat:downloadURL <http://localhost:8000/tests/data/rdf.xml> ] . + diff --git a/tests/data/geotiff.zip b/tests/data/geotiff.zip new file mode 100644 index 0000000000000000000000000000000000000000..a668e2664f6c3b8f91841427bd8e0d9c3652d67c Binary files /dev/null and b/tests/data/geotiff.zip differ diff --git a/tests/data/image.jpeg b/tests/data/image.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..3df13ea92d222614d298eb082b9ebae865f880d1 Binary files /dev/null and b/tests/data/image.jpeg differ diff --git a/tests/data/rdf.json b/tests/data/rdf.json new file mode 100644 index 0000000000000000000000000000000000000000..21a9c10f88d2b005a0196595572967b7c5b05066 --- /dev/null +++ b/tests/data/rdf.json @@ -0,0 +1,273 @@ +[ + { + "@id": "https://example.org/dataset/87e42608-769f-4ca8-8593-7546a027b2b8", + "@type": [ + "http://www.w3.org/ns/dcat#Dataset" + ], + "http://purl.org/dc/terms/accessRights": [ + { + "@id": "http://publications.europa.eu/resource/authority/access-right/PUBLIC" + } + ], + "http://purl.org/dc/terms/description": [ + { + "@value": "Anzahl täglicher Landungen und Starts unbekannter Flugobjekte (UFOs) in Schleswig-Holstein. 🛸👽\n##Methodik\nGezählt werden nur die Landungen und Starts von UFOs, die gemeldet und zusätzlich offiziell bestätigt wurden. 
Sichtungen, die zu keinem Bodenkontakt führen, werden nicht gezählt.\n##Attribute\n- `datum` - Datum\n- `ufo_landungen` - Anzahl UFO-Landungen\n- `ufo_starts` - Anzahl UFO-Starts\n" + } + ], + "http://purl.org/dc/terms/identifier": [ + { + "@value": "87e42608-769f-4ca8-8593-7546a027b2b8" + } + ], + "http://purl.org/dc/terms/issued": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime", + "@value": "2024-06-18T07:20:05.693344" + } + ], + "http://purl.org/dc/terms/license": [ + { + "@id": "http://dcat-ap.de/def/licenses/cc-zero" + } + ], + "http://purl.org/dc/terms/modified": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime", + "@value": "2024-06-18T07:20:05.693344" + } + ], + "http://purl.org/dc/terms/publisher": [ + { + "@id": "https://example.org/organization/ufo-kontrolle" + } + ], + "http://purl.org/dc/terms/spatial": [ + { + "@id": "http://dcat-ap.de/def/politicalGeocoding/stateKey/01" + } + ], + "http://purl.org/dc/terms/temporal": [ + { + "@id": "_:n1fa3c2476143497285348e0c39705837b1" + } + ], + "http://purl.org/dc/terms/title": [ + { + "@value": "Bestätigte UFO-Landungen und -Starts" + } + ], + "http://www.w3.org/ns/dcat#distribution": [ + { + "@id": "_:n1fa3c2476143497285348e0c39705837b4" + }, + { + "@id": "_:n1fa3c2476143497285348e0c39705837b2" + } + ], + "http://www.w3.org/ns/dcat#keyword": [ + { + "@value": "Weltall" + }, + { + "@value": "Start" + }, + { + "@value": "Landung" + }, + { + "@value": "Raumschiff" + }, + { + "@value": "UFO" + }, + { + "@value": "Testdaten" + } + ], + "http://www.w3.org/ns/dcat#theme": [ + { + "@id": "http://publications.europa.eu/resource/authority/data-theme/INTL" + } + ] + }, + { + "@id": "_:n1fa3c2476143497285348e0c39705837b4", + "@type": [ + "http://www.w3.org/ns/dcat#Distribution" + ], + "http://purl.org/dc/terms/format": [ + { + "@id": "http://publications.europa.eu/resource/authority/file-type/JSON" + } + ], + "http://purl.org/dc/terms/issued": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime", + "@value": "2024-06-18T05:20:07.232559" + } + ], + "http://purl.org/dc/terms/license": [ + { + "@id": "http://dcat-ap.de/def/licenses/cc-zero" + } + ], + "http://purl.org/dc/terms/modified": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime", + "@value": "2024-06-18T05:20:07.191976" + } + ], + "http://purl.org/dc/terms/rights": [ + { + "@id": "http://dcat-ap.de/def/licenses/cc-zero" + } + ], + "http://purl.org/dc/terms/title": [ + { + "@value": "Frictionless Data Resource" + } + ], + "http://spdx.org/rdf/terms#checksum": [ + { + "@id": "_:n1fa3c2476143497285348e0c39705837b5" + } + ], + "http://www.w3.org/ns/dcat#accessURL": [ + { + "@id": "http://localhost:8000/ufo-resource.json" + } + ], + "http://www.w3.org/ns/dcat#byteSize": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#integer", + "@value": 487 + } + ], + "http://www.w3.org/ns/dcat#downloadURL": [ + { + "@id": "http://localhost:8000/ufo-resource.json" + } + ], + "http://www.w3.org/ns/dcat#mediaType": [ + { + "@id": "https://www.iana.org/assignments/media-types/application/csv" + } + ] + }, + { + "@id": "_:n1fa3c2476143497285348e0c39705837b5", + "@type": [ + "http://spdx.org/rdf/terms#Checksum" + ], + "http://spdx.org/rdf/terms#algorithm": [ + { + "@id": "http://spdx.org/rdf/terms#checksumAlgorithm_md5" + } + ], + "http://spdx.org/rdf/terms#checksumValue": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#hexBinary", + "@value": "8dca8b179bbe0d46c5004da5112f6c4c" + } + ] + }, + { + "@id": "_:n1fa3c2476143497285348e0c39705837b2", + "@type": [ + 
"http://www.w3.org/ns/dcat#Distribution" + ], + "http://purl.org/dc/terms/format": [ + { + "@id": "http://publications.europa.eu/resource/authority/file-type/CSV" + } + ], + "http://purl.org/dc/terms/issued": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime", + "@value": "2024-06-18T05:20:07.232559" + } + ], + "http://purl.org/dc/terms/license": [ + { + "@id": "http://dcat-ap.de/def/licenses/cc-zero" + } + ], + "http://purl.org/dc/terms/modified": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#dateTime", + "@value": "2024-06-18T05:20:07.191976" + } + ], + "http://purl.org/dc/terms/rights": [ + { + "@id": "http://dcat-ap.de/def/licenses/cc-zero" + } + ], + "http://purl.org/dc/terms/title": [ + { + "@value": "ufo.csv" + } + ], + "http://spdx.org/rdf/terms#checksum": [ + { + "@id": "_:n1fa3c2476143497285348e0c39705837b3" + } + ], + "http://www.w3.org/ns/dcat#accessURL": [ + { + "@id": "http://localhost:8000/ufo.csv" + } + ], + "http://www.w3.org/ns/dcat#byteSize": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#integer", + "@value": 151 + } + ], + "http://www.w3.org/ns/dcat#downloadURL": [ + { + "@id": "http://localhost:8000/ufo.csv" + } + ], + "http://www.w3.org/ns/dcat#mediaType": [ + { + "@id": "https://www.iana.org/assignments/media-types/application/csv" + } + ] + }, + { + "@id": "_:n1fa3c2476143497285348e0c39705837b3", + "@type": [ + "http://spdx.org/rdf/terms#Checksum" + ], + "http://spdx.org/rdf/terms#algorithm": [ + { + "@id": "http://spdx.org/rdf/terms#checksumAlgorithm_sha1" + } + ], + "http://spdx.org/rdf/terms#checksumValue": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#hexBinary", + "@value": "3ffba0a43d3497a7918b376a335c31fbecc9325b" + } + ] + }, + { + "@id": "_:n1fa3c2476143497285348e0c39705837b1", + "@type": [ + "http://purl.org/dc/terms/PeriodOfTime" + ], + "http://www.w3.org/ns/dcat#endDate": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#date", + "@value": "2024-06-17" + } + ], + "http://www.w3.org/ns/dcat#startDate": [ + { + "@type": "http://www.w3.org/2001/XMLSchema#date", + "@value": "2024-06-10" + } + ] + } +] \ No newline at end of file diff --git a/tests/data/rdf.xml b/tests/data/rdf.xml new file mode 100644 index 0000000000000000000000000000000000000000..3e6690e5ad4d230b9dae318093040213914810bd --- /dev/null +++ b/tests/data/rdf.xml @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<rdf:RDF + xmlns:dcat="http://www.w3.org/ns/dcat#" + xmlns:dcterms="http://purl.org/dc/terms/" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:spdx="http://spdx.org/rdf/terms#" +> + <rdf:Description rdf:about="https://example.org/dataset/87e42608-769f-4ca8-8593-7546a027b2b8"> + <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/> + <dcterms:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/> + <dcterms:description>Anzahl täglicher Landungen und Starts unbekannter Flugobjekte (UFOs) in Schleswig-Holstein. 🛸👽 +##Methodik +Gezählt werden nur die Landungen und Starts von UFOs, die gemeldet und zusätzlich offiziell bestätigt wurden. Sichtungen, die zu keinem Bodenkontakt führen, werden nicht gezählt. 
+##Attribute +- `datum` - Datum +- `ufo_landungen` - Anzahl UFO-Landungen +- `ufo_starts` - Anzahl UFO-Starts +</dcterms:description> + <dcterms:identifier>87e42608-769f-4ca8-8593-7546a027b2b8</dcterms:identifier> + <dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2024-06-18T07:20:05.693344</dcterms:issued> + <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2024-06-18T07:20:05.693344</dcterms:modified> + <dcterms:license rdf:resource="http://dcat-ap.de/def/licenses/cc-zero"/> + <dcterms:publisher rdf:resource="https://example.org/organization/ufo-kontrolle"/> + <dcterms:spatial rdf:resource="http://dcat-ap.de/def/politicalGeocoding/stateKey/01"/> + <dcterms:temporal rdf:nodeID="n6747fd43db2143cca14c39970555b181b1"/> + <dcterms:title>Bestätigte UFO-Landungen und -Starts</dcterms:title> + <dcat:distribution rdf:nodeID="n6747fd43db2143cca14c39970555b181b2"/> + <dcat:distribution rdf:nodeID="n6747fd43db2143cca14c39970555b181b4"/> + <dcat:keyword>UFO</dcat:keyword> + <dcat:keyword>Landung</dcat:keyword> + <dcat:keyword>Start</dcat:keyword> + <dcat:keyword>Raumschiff</dcat:keyword> + <dcat:keyword>Weltall</dcat:keyword> + <dcat:keyword>Testdaten</dcat:keyword> + <dcat:theme rdf:resource="http://publications.europa.eu/resource/authority/data-theme/INTL"/> + </rdf:Description> + <rdf:Description rdf:nodeID="n6747fd43db2143cca14c39970555b181b4"> + <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Distribution"/> + <dcterms:format rdf:resource="http://publications.europa.eu/resource/authority/file-type/JSON"/> + <dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2024-06-18T05:20:07.232559</dcterms:issued> + <dcterms:license rdf:resource="http://dcat-ap.de/def/licenses/cc-zero"/> + <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2024-06-18T05:20:07.191976</dcterms:modified> + <dcterms:rights rdf:resource="http://dcat-ap.de/def/licenses/cc-zero"/> + <dcterms:title>Frictionless Data Resource</dcterms:title> + <spdx:checksum rdf:nodeID="n6747fd43db2143cca14c39970555b181b5"/> + <dcat:accessURL rdf:resource="http://localhost:8000/ufo-resource.json"/> + <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">487</dcat:byteSize> + <dcat:downloadURL rdf:resource="http://localhost:8000/ufo-resource.json"/> + <dcat:mediaType rdf:resource="https://www.iana.org/assignments/media-types/application/csv"/> + </rdf:Description> + <rdf:Description rdf:nodeID="n6747fd43db2143cca14c39970555b181b2"> + <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Distribution"/> + <dcterms:format rdf:resource="http://publications.europa.eu/resource/authority/file-type/CSV"/> + <dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2024-06-18T05:20:07.232559</dcterms:issued> + <dcterms:license rdf:resource="http://dcat-ap.de/def/licenses/cc-zero"/> + <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2024-06-18T05:20:07.191976</dcterms:modified> + <dcterms:rights rdf:resource="http://dcat-ap.de/def/licenses/cc-zero"/> + <dcterms:title>ufo.csv</dcterms:title> + <spdx:checksum rdf:nodeID="n6747fd43db2143cca14c39970555b181b3"/> + <dcat:accessURL rdf:resource="http://localhost:8000/ufo.csv"/> + <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">151</dcat:byteSize> + <dcat:downloadURL rdf:resource="http://localhost:8000/ufo.csv"/> + <dcat:mediaType rdf:resource="https://www.iana.org/assignments/media-types/application/csv"/> + </rdf:Description> + <rdf:Description 
rdf:nodeID="n6747fd43db2143cca14c39970555b181b5"> + <rdf:type rdf:resource="http://spdx.org/rdf/terms#Checksum"/> + <spdx:algorithm rdf:resource="http://spdx.org/rdf/terms#checksumAlgorithm_md5"/> + <spdx:checksumValue rdf:datatype="http://www.w3.org/2001/XMLSchema#hexBinary">8dca8b179bbe0d46c5004da5112f6c4c</spdx:checksumValue> + </rdf:Description> + <rdf:Description rdf:nodeID="n6747fd43db2143cca14c39970555b181b3"> + <rdf:type rdf:resource="http://spdx.org/rdf/terms#Checksum"/> + <spdx:algorithm rdf:resource="http://spdx.org/rdf/terms#checksumAlgorithm_sha1"/> + <spdx:checksumValue rdf:datatype="http://www.w3.org/2001/XMLSchema#hexBinary">3ffba0a43d3497a7918b376a335c31fbecc9325b</spdx:checksumValue> + </rdf:Description> + <rdf:Description rdf:nodeID="n6747fd43db2143cca14c39970555b181b1"> + <rdf:type rdf:resource="http://purl.org/dc/terms/PeriodOfTime"/> + <dcat:endDate rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2024-06-17</dcat:endDate> + <dcat:startDate rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2024-06-10</dcat:startDate> + </rdf:Description> +</rdf:RDF> diff --git a/tests/data/text.txt b/tests/data/text.txt new file mode 100644 index 0000000000000000000000000000000000000000..a24d3d5699a013fb4d223a6e293c86caa1ceb35f --- /dev/null +++ b/tests/data/text.txt @@ -0,0 +1 @@ +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. diff --git a/tests/data/ufo-resource.json b/tests/data/ufo-resource.json index d05d9c2c4e9b7a1c29605b8f9695917580771024..701306ab8533c00d9c13d8c03bfa10ddd9d19435 100644 --- a/tests/data/ufo-resource.json +++ b/tests/data/ufo-resource.json @@ -1,7 +1,7 @@ { "name": "ufo", "type": "table", - "path": "http://localhost:8000/ufo.csv", + "path": "tests/data/ufo.csv", "format": "csv", "mediatype": "text/csv", "encoding": "utf-8", diff --git a/tests/data/valid.docx b/tests/data/valid.docx new file mode 100644 index 0000000000000000000000000000000000000000..2fc6b99e13b6734528592b091b1fa76f3dbd8b2c Binary files /dev/null and b/tests/data/valid.docx differ diff --git a/tests/data/valid.ods b/tests/data/valid.ods new file mode 100644 index 0000000000000000000000000000000000000000..3726fbeac976ef3a6a7399dd2db5682b88c513b4 Binary files /dev/null and b/tests/data/valid.ods differ diff --git a/tests/data/valid.odt b/tests/data/valid.odt new file mode 100644 index 0000000000000000000000000000000000000000..f4802adff3a9191037421a3ccd64151e95d8fe1f Binary files /dev/null and b/tests/data/valid.odt differ diff --git a/tests/data/valid.pdf b/tests/data/valid.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d30b784dfa4a4d7ace9109d100f460b92df8663b Binary files /dev/null and b/tests/data/valid.pdf differ diff --git a/tests/data/valid.tif b/tests/data/valid.tif new file mode 100644 index 0000000000000000000000000000000000000000..37c68216a82fe457b00e1f3fd6169f764c0933e3 Binary files /dev/null and b/tests/data/valid.tif differ diff --git a/tests/data/valid.xlsx b/tests/data/valid.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..5f1a3bcdf0411c4b605d5fa51708daf42dc48ae7 Binary files /dev/null and b/tests/data/valid.xlsx differ diff --git a/tests/data/valid_geotiff.tif b/tests/data/valid_geotiff.tif new file mode 100644 index 0000000000000000000000000000000000000000..2fd42d80c8dd69963288689ff84a58009dbdd6d0 Binary files /dev/null and b/tests/data/valid_geotiff.tif differ diff --git a/tests/test_all_formats.py 
b/tests/test_all_formats.py new file mode 100644 index 0000000000000000000000000000000000000000..b2b8fd9d0d51797a984b6d255509c4839da219c8 --- /dev/null +++ b/tests/test_all_formats.py @@ -0,0 +1,26 @@ +import unittest +import importlib +import pkgutil +import tempfile + + +class TestAllFormats(unittest.TestCase): + def test_load_all_modules(self): + """Make sure that every format module has been loaded at least once. + Otherwise, the code coverage will not know about the file.""" + package = importlib.import_module("formats") + modules = [module.name for module in pkgutil.iter_modules(package.__path__)] + for module in modules: + format_check_module = importlib.import_module("formats." + module) + with tempfile.NamedTemporaryFile(delete=True) as temp_file: + resource = {} + resource["url"] = "https://test.invalid/data" + try: + format_check_module.is_valid(resource, temp_file) + except Exception as e: + print(f"Module for format {module} failed.") + raise (e) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_atom_format.py b/tests/test_atom_format.py new file mode 100644 index 0000000000000000000000000000000000000000..902b3339471f32dfc620d15a6402b903704b6517 --- /dev/null +++ b/tests/test_atom_format.py @@ -0,0 +1,27 @@ +import unittest +from formats.atom_format import is_valid + + +class TestAtomFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/Atom_SH_Feldblockfinder_OpenGBD.xml", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__other_xml(self): + resource = {} + with open("tests/data/correct.xml", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + def test_is_valid__invalid_xml(self): + resource = {} + with open("tests/data/incorrect.xml", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_format_fidelity_checker.py b/tests/test_dcat_catalog_check.py similarity index 73% rename from tests/test_format_fidelity_checker.py rename to tests/test_dcat_catalog_check.py index 6c21860f2aab0768936f1b38a649221aebde3a11..9100048b71d8d956a3bf72d7a626247c880bf86a 100644 --- a/tests/test_format_fidelity_checker.py +++ b/tests/test_dcat_catalog_check.py @@ -7,14 +7,19 @@ from dcat_catalog_check import ( ) from rdflib import Graph from rdflib.namespace import RDF, DCAT +from http.server import SimpleHTTPRequestHandler, HTTPServer +import threading +import time +import io +import requests +import sys class TestDcatCatalogCheck(unittest.TestCase): def setUp(self): - self.dcc = DcatCatalogCheck( - "http://localhost:8000/", "my_api_key") + self.dcc = DcatCatalogCheck("http://test.invalid:8000/", "my_api_key") # Mock the logger to capture log messages - self.logger_patch = patch.object(self.dcc, 'logger', MagicMock()) + self.logger_patch = patch.object(self.dcc, "logger", MagicMock()) self.mock_logger = self.logger_patch.start() def tearDown(self): @@ -30,13 +35,10 @@ class TestDcatCatalogCheck(unittest.TestCase): "XML": ["application/xml"], } - self.assertTrue(self.dcc.is_mime_type_compatible( - "JSON", "application/json")) - self.assertFalse(self.dcc.is_mime_type_compatible( - "JSON", "application/xml")) + self.assertTrue(self.dcc.is_mime_type_compatible("JSON", "application/json")) + self.assertFalse(self.dcc.is_mime_type_compatible("JSON", "application/xml")) 
self.assertFalse( - self.dcc.is_mime_type_compatible( - "UnknownFormat", "application/json") + self.dcc.is_mime_type_compatible("UnknownFormat", "application/json") ) def test_read_allowed_file_formats(self): @@ -48,8 +50,7 @@ class TestDcatCatalogCheck(unittest.TestCase): ): formats = self.dcc.read_allowed_file_formats() self.assertEqual( - formats, {"JSON": ["application/json"], - "XML": ["application/xml"]} + formats, {"JSON": ["application/json"], "XML": ["application/xml"]} ) def test_load_uri_replacements(self): @@ -59,10 +60,9 @@ class TestDcatCatalogCheck(unittest.TestCase): read_data='[{"regex": "old", "replaced_by": "new"}]' ), ): - replacements = self.dcc.load_uri_replacements() - self.assertEqual( - replacements, [{"regex": "old", "replaced_by": "new"}]) + replacements = self.dcc.load_uri_replacements() + self.assertEqual(replacements, [{"regex": "old", "replaced_by": "new"}]) # Simulate that the file does not exist @@ -111,7 +110,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] = "http://test.invalid/data" resource["format"] = "JSON" self.dcc.check_resource(resource) self.assertEqual(resource["accessible"], True) @@ -128,7 +127,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] = "http://test.invalid/data" resource["format"] = "JSON" self.dcc.check_resource(resource) self.assertEqual(resource["accessible"], True) @@ -146,7 +145,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] = "http://test.invalid/data" resource["format"] = "JSON" self.dcc.check_resource(resource) self.assertEqual(resource["accessible"], True) @@ -164,7 +163,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] = "http://test.invalid/data" resource["format"] = "JSON" self.dcc.check_resource(resource) self.assertEqual(resource["accessible"], True) @@ -182,7 +181,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] = "http://test.invalid/data" resource["format"] = "JSON" self.dcc.check_resource(resource) self.assertEqual(resource["accessible"], True) @@ -198,7 +197,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] = "http://test.invalid/data" resource["format"] = "XML" self.dcc.check_resource(resource) self.assertEqual(resource["accessible"], True) @@ -214,7 +213,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] = "http://test.invalid/data" resource["format"] = "PNG" resource["checksum_algorithm"] = ( "http://spdx.org/rdf/terms#checksumAlgorithm_sha1" @@ -247,7 +246,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] =
"http://test.invalid/data" resource["format"] = "JSON" self.dcc.check_resource(resource) self.assertEqual(resource.get("accessible"), True) @@ -266,7 +265,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] = "http://test.invalid/data" resource["format"] = "JSON" self.dcc.check_resource(resource) self.assertEqual(resource.get("accessible"), True) @@ -285,7 +284,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/data" + resource["url"] = "http://test.invalid/data" resource["format"] = "JSON" self.dcc.check_resource(resource) self.assertEqual(resource.get("accessible"), True) @@ -312,7 +311,7 @@ class TestDcatCatalogCheck(unittest.TestCase): self.dcc.load_http_complete = MagicMock(return_value=mock_response) resource = {} - resource["url"] = "http://localhost/zos116.zip" + resource["url"] = "http://test.invalid/zos116.zip" resource["format"] = "SHP" self.dcc.check_resource(resource) @@ -326,7 +325,7 @@ class TestDcatCatalogCheck(unittest.TestCase): # Test data to simulate the contents of previous_results.json test_data = [ {"url": "http://example.com", "status": "valid", "format": "JSON"}, - {"url": "http://example.org", "status": "invalid", "format": "XML"} + {"url": "http://example.org", "status": "invalid", "format": "XML"}, ] # Write test data to a file 'previous_results.json' @@ -342,9 +341,11 @@ class TestDcatCatalogCheck(unittest.TestCase): self.assertIn("http://example.com", self.dcc.previous_results) self.assertIn("http://example.org", self.dcc.previous_results) self.assertEqual( - self.dcc.previous_results["http://example.com"]["status"], "valid") + self.dcc.previous_results["http://example.com"]["status"], "valid" + ) self.assertEqual( - self.dcc.previous_results["http://example.org"]["status"], "invalid") + self.dcc.previous_results["http://example.org"]["status"], "invalid" + ) @patch("os.path.exists", return_value=False) def test_read_previous_results_file_not_exist(self, mock_exists): @@ -365,7 +366,12 @@ class TestDcatCatalogCheck(unittest.TestCase): "Invalid JSON at line 1: Expecting value: line 1 column 1 (char 0)" ) - @patch("builtins.open", mock_open(read_data='{"status": "valid", "format": "JSON"}\n{"url": "http://example.com", "status": "valid", "format": "JSON"}')) + @patch( + "builtins.open", + mock_open( + read_data='{"status": "valid", "format": "JSON"}\n{"url": "http://example.com", "status": "valid", "format": "JSON"}' + ), + ) @patch("os.path.exists", return_value=True) def test_read_previous_results_missing_url(self, mock_exists): """Test when the file has a line with missing 'url'.""" @@ -375,6 +381,117 @@ class TestDcatCatalogCheck(unittest.TestCase): 'Line 1 is missing \'url\': {"status": "valid", "format": "JSON"}' ) + def test_apply_uri_replacements(self): + """Test the apply_uri_replacements method.""" + # Setup URI replacements + self.dcc.uri_replacements = [ + {"regex": r"example\.com", "replaced_by": "test.com"}, + {"regex": r"http://", "replaced_by": "https://"}, + ] + + # URL matching both replacements + url = "http://example.com/path" + result = self.dcc.apply_uri_replacements(url) + self.assertEqual(result, "https://test.com/path") + + # URL matching only one replacement + url = "http://other.com/path" + result = self.dcc.apply_uri_replacements(url) + self.assertEqual(result, 
"https://other.com/path") + + # URL with no matches + url = "https://unchanged.com/path" + result = self.dcc.apply_uri_replacements(url) + self.assertEqual(result, "https://unchanged.com/path") + + # Empty URL + url = "" + result = self.dcc.apply_uri_replacements(url) + self.assertEqual(result, "") + + # No URI replacements defined + self.dcc.uri_replacements = [] + url = "http://example.com/path" + result = self.dcc.apply_uri_replacements(url) + self.assertEqual(result, "http://example.com/path") + + def test_clear_result(self): + """Test the _clear_result method.""" + # Define a resource dictionary with keys to clear and some additional keys + resource = { + "accessible": True, + "checksum_ok": True, + "duration": 1.23, + "error": "Some error", + "etag": "some-etag", + "http_status": 200, + "last_check": "2024-12-27T12:34:56Z", + "mimetype": "application/json", + "mimetype_mismatch": False, + "valid": True, + "url": "http://example.com/data", # This key should remain untouched + "format": "JSON", # This key should remain untouched + } + + # Call the _clear_result method + self.dcc._clear_result(resource) + + # Check that all keys to clear have been removed + for key in [ + "accessible", + "checksum_ok", + "duration", + "error", + "etag", + "http_status", + "last_check", + "mimetype", + "mimetype_mismatch", + "valid", + ]: + self.assertNotIn(key, resource) + + # Check that unrelated keys remain + self.assertIn("url", resource) + self.assertIn("format", resource) + self.assertEqual(resource["url"], "http://example.com/data") + self.assertEqual(resource["format"], "JSON") + + def _wait_for_server(self, url, timeout=10, interval=0.2): + """Wait until the server can be reached at the specified URL.""" + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(url) + if response.status_code == 200: + return True + except requests.exceptions.RequestException: + pass + time.sleep(interval) + print(f"Timeout reached: Server at {url} not reachable.") + return False + + def _run_server(self): + server_address = ("", 8000) + httpd = HTTPServer(server_address, SimpleHTTPRequestHandler) + httpd.serve_forever() + + def test_read_dcat_catalog(self): + server_thread = threading.Thread(target=self._run_server) + server_thread.daemon = True + server_thread.start() + self._wait_for_server("http://localhost:8000") + + mock_stdout = io.StringIO() + sys.stdout = mock_stdout + self.dcc.read_dcat_catalog("http://localhost:8000/tests/data/all-tests.ttl") + sys.stdout = sys.__stdout__ + + output = mock_stdout.getvalue() + json_objects = [json.loads(line) for line in output.splitlines()] + + self.assertEqual(len(json_objects), 31) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_docx_format.py b/tests/test_docx_format.py new file mode 100644 index 0000000000000000000000000000000000000000..647b3693b2c616cc6afeb9331812d70b5e492e28 --- /dev/null +++ b/tests/test_docx_format.py @@ -0,0 +1,32 @@ +import unittest +from formats.docx_format import is_valid + + +class TestDocxFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/valid.docx", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__invalid_no_zip(self): + resource = {} + with open("tests/data/correct.json", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + def test_is_valid__invalid_no_excel(self): + resource = {} + with 
open("tests/data/valid.xlsx", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + def test_is_valid__invalid_zip(self): + resource = {} + with open("tests/data/valid.odt", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_geojson_format.py b/tests/test_geojson_format.py index de63ebcf67b190ca41c3cbc721744336d770a729..39239a13275e5b9bea750aca2fe74be90fb8a97e 100644 --- a/tests/test_geojson_format.py +++ b/tests/test_geojson_format.py @@ -2,16 +2,18 @@ import unittest from formats.geojson_format import is_valid -class TestShpFormat(unittest.TestCase): +class TestGeoJsonFormat(unittest.TestCase): def test_is_valid__valid(self): resource = {} with open("tests/data/bermuda.geojson", "r") as file: self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) def test_is_valid__invalid(self): resource = {} with open("tests/data/correct.json", "r") as file: self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) if __name__ == "__main__": diff --git a/tests/test_geotiff_format.py b/tests/test_geotiff_format.py new file mode 100644 index 0000000000000000000000000000000000000000..e3e01644689cdd4005000e2933a810cf8e7a9f97 --- /dev/null +++ b/tests/test_geotiff_format.py @@ -0,0 +1,26 @@ +import unittest +from formats.geotiff_format import is_valid + + +class TestGeotiffFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/valid_geotiff.tif", "r") as file: + self.assertTrue(is_valid(resource, file)) + + def test_is_valid__zip(self): + """The ZIP file contains the TIFF image and a TFW world file.""" + resource = {} + with open("tests/data/geotiff.zip", "r") as file: + self.assertTrue(is_valid(resource, file)) + + def test_is_valid__invalid(self): + resource = {} + with open("tests/data/valid.tif", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + self.assertEqual("missing transformation", resource["error"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_gml_format.py b/tests/test_gml_format.py index a2e40a7d8820ae251db773c05721cc93cd5bd34b..e63638cfa5c2e4616dfc334b28432f25592ab9a2 100644 --- a/tests/test_gml_format.py +++ b/tests/test_gml_format.py @@ -7,11 +7,13 @@ class TestGmlFormat(unittest.TestCase): resource = {} with open("tests/data/bermuda.gml", "r") as file: self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) def test_is_valid__invalid(self): resource = {} with open("tests/data/correct.xml", "r") as file: self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) if __name__ == "__main__": diff --git a/tests/test_jpeg_format.py b/tests/test_jpeg_format.py new file mode 100644 index 0000000000000000000000000000000000000000..370409d9fd4131338053d2e27f2a7e74663b85f1 --- /dev/null +++ b/tests/test_jpeg_format.py @@ -0,0 +1,18 @@ +import unittest +from formats.jpeg_format import is_valid + + +class TestJpegFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/image.jpeg", "r") as file: + self.assertTrue(is_valid(resource, file)) + + def test_is_valid__invalid(self): + resource = {} + with open("tests/data/image.png", "r") as file: + self.assertFalse(is_valid(resource, file)) + + +if __name__ == "__main__": + unittest.main() 
diff --git a/tests/test_json_format.py b/tests/test_json_format.py index 2f579ab8105a956aac4816b33ee2767ff436bccd..23a892cc948f247d50ad92ad3e3ae5d3c0ca2caf 100644 --- a/tests/test_json_format.py +++ b/tests/test_json_format.py @@ -15,6 +15,13 @@ class TestJsonFormat(unittest.TestCase): self.assertFalse(is_valid(resource, file)) self.assertIsNotNone(resource.get("error")) + def test_is_valid__frictionless_valid(self): + resource = {} + with open("tests/data/ufo-resource.json", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + self.assertTrue(resource.get("schema_valid")) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_ods_format.py b/tests/test_ods_format.py new file mode 100644 index 0000000000000000000000000000000000000000..e152d5eddf08627d001a69d4f1ff7a11146683d8 --- /dev/null +++ b/tests/test_ods_format.py @@ -0,0 +1,36 @@ +import unittest +from formats.ods_format import is_valid + + +class TestOdsFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/valid.ods", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__invalid_no_zip(self): + resource = {} + with open("tests/data/correct.json", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + def test_is_valid__invalid_no_ods(self): + resource = {} + with open("tests/data/valid.odt", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + self.assertEqual( + "Incorrect MIME type: application/vnd.oasis.opendocument.text", + resource["error"], + ) + + def test_is_valid__invalid_zip(self): + resource = {} + with open("tests/data/valid.xlsx", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_odt_format.py b/tests/test_odt_format.py new file mode 100644 index 0000000000000000000000000000000000000000..8276fe7ffc3490115616eb413c3b40fc3ab9f917 --- /dev/null +++ b/tests/test_odt_format.py @@ -0,0 +1,36 @@ +import unittest +from formats.odt_format import is_valid + + +class TestOdtFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/valid.odt", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__invalid_no_zip(self): + resource = {} + with open("tests/data/correct.json", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + def test_is_valid__invalid_no_odt(self): + resource = {} + with open("tests/data/valid.ods", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + self.assertEqual( + "Incorrect MIME type: application/vnd.oasis.opendocument.spreadsheet", + resource["error"], + ) + + def test_is_valid__invalid_zip(self): + resource = {} + with open("tests/data/valid.docx", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_pdf_format.py b/tests/test_pdf_format.py new file mode 100644 index 0000000000000000000000000000000000000000..1f6e25f87e2d3f056fbb5304e0bd6cb4bc1f552b --- /dev/null +++ b/tests/test_pdf_format.py @@ -0,0 +1,18 @@ +import unittest +from formats.pdf_format
import is_valid + + +class TestPdfFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/valid.pdf", "r") as file: + self.assertTrue(is_valid(resource, file)) + + def test_is_valid__invalid(self): + resource = {} + with open("tests/data/image.png", "r") as file: + self.assertFalse(is_valid(resource, file)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_rdf_format.py b/tests/test_rdf_format.py new file mode 100644 index 0000000000000000000000000000000000000000..8a313f36f32b899de757e64ee65085addaa69295 --- /dev/null +++ b/tests/test_rdf_format.py @@ -0,0 +1,32 @@ +import unittest +from formats.rdf_format import is_valid + + +class TestRdfFormat(unittest.TestCase): + def test_is_valid__valid_turtle(self): + resource = {} + with open("tests/data/ufo.ttl", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__valid_xml(self): + resource = {} + with open("tests/data/rdf.xml", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__valid_jsonld(self): + resource = {} + with open("tests/data/rdf.json", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__invalid(self): + resource = {} + with open("tests/data/correct.json", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_wmts_format.py b/tests/test_wmts_format.py new file mode 100644 index 0000000000000000000000000000000000000000..9301e22c9092ab52f00277c1b23314ac6663c28f --- /dev/null +++ b/tests/test_wmts_format.py @@ -0,0 +1,26 @@ +import unittest +from formats.wmts_srvc_format import is_valid + + +class TestWmtsSrvcFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + resource["url"] = ( + "https://dienste.gdi-sh.invalid/WMTS_SH_ALKIS_OpenGBD/wmts/1.0.0/WMTSCapabilities.xml" + ) + with open("tests/data/WMTSCapabilities.xml", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__invalid(self): + resource = {} + resource["url"] = ( + "https://dienste.gdi-sh.invalid/WMTS_SH_ALKIS_OpenGBD/wmts/1.0.0/WMTSCapabilities.xml" + ) + with open("tests/data/correct.xml", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_xlsx_format.py b/tests/test_xlsx_format.py new file mode 100644 index 0000000000000000000000000000000000000000..3de99678c67ccfacc82559384de1792684fefae2 --- /dev/null +++ b/tests/test_xlsx_format.py @@ -0,0 +1,32 @@ +import unittest +from formats.xlsx_format import is_valid + + +class TestXlsxFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/valid.xlsx", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__invalid_no_zip(self): + resource = {} + with open("tests/data/correct.json", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + def test_is_valid__invalid_no_excel(self): + resource = {} + with open("tests/data/valid.docx", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + def 
test_is_valid__invalid_zip(self): + resource = {} + with open("tests/data/valid.ods", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_xml_format.py b/tests/test_xml_format.py new file mode 100644 index 0000000000000000000000000000000000000000..5672f9f4e6ef3342e443d960dcb01090f28015a9 --- /dev/null +++ b/tests/test_xml_format.py @@ -0,0 +1,20 @@ +import unittest +from formats.xml_format import is_valid + + +class TestXmlFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/correct.xml", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__invalid(self): + resource = {} + with open("tests/data/incorrect.xml", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_zip_format.py b/tests/test_zip_format.py new file mode 100644 index 0000000000000000000000000000000000000000..61617376b0a56cfec1b5c75f61d9f65d128286ae --- /dev/null +++ b/tests/test_zip_format.py @@ -0,0 +1,21 @@ +import unittest +from formats.zip_format import is_valid + + +class TestZipFormat(unittest.TestCase): + def test_is_valid__valid(self): + resource = {} + with open("tests/data/bermuda.zip", "r") as file: + self.assertTrue(is_valid(resource, file)) + self.assertIsNone(resource.get("error")) + + def test_is_valid__invalid(self): + resource = {} + with open("tests/data/correct.xml", "r") as file: + self.assertFalse(is_valid(resource, file)) + self.assertIsNotNone(resource.get("error")) + self.assertEqual("Not a ZIP file.", resource["error"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/generate_report.py b/tools/generate_report.py index 78e1433a834ba0a82f072c6a7093be503a93eab5..5acf6fb4cd4d91bcbe5c998cb4d2771c368cf6c2 100755 --- a/tools/generate_report.py +++ b/tools/generate_report.py @@ -11,14 +11,14 @@ def write_diagram(id, title, counter, counter_publisher): print(f"<h2>{title}</h2>") print(f"<div id='vis{id}' style='max-width: 400px;'></div>") print('<script type="text/javascript">') - + print(f"new ApexCharts(document.querySelector('#vis{id}'),") print("{ chart: { type: 'donut' },") print(f"series: [{counter[True]}, {counter[False]}, {counter[None]}],") print("labels: ['korrekt', 'fehlerhaft', 'nicht geprüft'],") print('colors: ["#1eae9c", "#d4004b", "#a4adb6"]') print("}).render();") - + print("</script>") print("<h3>Publishers affected</h3>") @@ -88,7 +88,9 @@ print("<html>") print(" <head>") print(" <title>DCAT Catalog Check</title>") print(' <script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>') -print(' <link rel="stylesheet" href="https://cdn.datatables.net/2.1.8/css/dataTables.dataTables.css" />') +print( + ' <link rel="stylesheet" href="https://cdn.datatables.net/2.1.8/css/dataTables.dataTables.css" />' +) print(' <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>') print(' <script src="https://cdn.datatables.net/2.1.8/js/dataTables.js"></script>') print(" </head>") @@ -113,17 +115,54 @@ print("<div>") print("<h2>Distributionen with errors</h2>") print('<table class="table" id="distributions">') -print("<thead><tr><th>Publisher</th><th>Format</th><th>available</th><th>content correct</th><th>MIME type wrong</th><th>MIME type</th><th>checksum correct</th><th>schema 
valid</th><th>URL</th><th>Error message</th></tr>" ) print("<tr>") print('<th><input type="text" placeholder="Filter by publisher" /></th>') print('<th><input type="text" placeholder="Filter by format" /></th>') print('<th><input type="text" placeholder="Filter by accessibility" /></th>') print('<th><input type="text" placeholder="Filter by HTTP status" /></th>') print('<th><input type="text" placeholder="Filter by correct content" /></th>') print('<th><input type="text" placeholder="Filter by MIME type error" /></th>') print('<th><input type="text" placeholder="Filter by MIME type" /></th>') print('<th><input type="text" placeholder="Filter by checksum" /></th>') print('<th><input type="text" placeholder="Filter by schema valid" /></th>') print('<th><input type="text" placeholder="Filter by url" /></th>') print('<th><input type="text" placeholder="Filter by error message" /></th>') print("</tr>") print("</thead>") print("<tbody>") for dist in distributions_with_problems: entry = distributions_with_problems[dist] - print(f"<tr><td>{entry.get('publisher')}</td><td>{entry.get('format')}</td><td>{entry.get('http_status','')}</td><td>{entry.get('valid','')}</td><td>{entry.get('mimetype_mismatch','')}</td><td>{entry.get('mimetype','')}</td><td>{entry.get('checksum_ok','')}</td><td>{entry.get('schema_valid','')}</td><td>{entry.get('url')}</td></tr>") + print( f"<tr><td>{entry.get('publisher')}</td><td>{entry.get('format')}</td><td>{entry.get('accessible','')}</td><td>{entry.get('http_status','')}</td><td>{entry.get('valid','')}</td><td>{entry.get('mimetype_mismatch','')}</td><td>{entry.get('mimetype','')}</td><td>{entry.get('checksum_ok','')}</td><td>{entry.get('schema_valid','')}</td><td>{entry.get('url')}</td><td>{entry.get('error','')}</td></tr>" ) print("</tbody></table>") print("</div>") -print("<script>let table = new DataTable('#distributions');</script>") +print(""" +<script> + $(document).ready(function() { + var table = $('#distributions').DataTable(); + + table.columns().every(function() { + var that = this; + + $('input', this.header()).on('keyup change', function() { + if (that.search() !== this.value) { + that + .search(this.value) + .draw(); + } + }); + }); + }); + +</script> +""") print("</body></html>")
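The result records rendered by the report above are produced as JSON Lines, one object per line, as exercised by test_read_previous_results and test_read_dcat_catalog, and the new table columns map directly onto record keys (`accessible`, `http_status`, `valid`, `mimetype`, `checksum_ok`, `schema_valid`, `error`). A small sketch of consuming such a results file; the path default is taken from the tests, while the filter predicate is an illustrative assumption:

    import json

    def read_results(path="previous_results.json"):
        # Yield one result record per line, skipping blank lines and records
        # without a "url" (mirroring test_read_previous_results_missing_url).
        with open(path, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                entry = json.loads(line)
                if "url" in entry:
                    yield entry

    # Example: distributions that were reachable but failed the content check.
    broken = [e for e in read_results() if e.get("accessible") and not e.get("valid")]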