Skip to content
Snippets Groups Projects
Verified Commit 0bf03353 authored by Jesper Zedlitz's avatar Jesper Zedlitz
Browse files

improved report generation

- list distributions for each type of error
- add explanatory text for each type of error
parent 6450bd42
No related branches found
No related tags found
No related merge requests found
Pipeline #1802 passed with warnings
......@@ -6,9 +6,10 @@ from collections import Counter
logfile_path = "result.jsonl"
def write_diagram(id, title, counter, counter_publisher):
def write_diagram(id, title, counter, counter_publisher, check, explanatory_text):
print("<div>")
print(f"<h2>{title}</h2>")
print(f"<p>{explanatory_text}</p>")
print(f"<div id='vis{id}' style='max-width: 400px;'></div>")
print('<script type="text/javascript">')
......@@ -21,13 +22,38 @@ def write_diagram(id, title, counter, counter_publisher):
print("</script>")
print("<h3>Publishers affected</h3>")
print("<table>")
print(f'<div class="accordion" id="accordion{id}">')
for p in counter_publisher:
print(f"<tr><td>{p}</td><td>{counter_publisher[p]}</td></tr>")
i = 0
for p in sorted(counter_publisher):
i = i + 1
print('<div class="accordion-item">')
print('<h3 class="accordion-header">')
print(
f'<button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapse_{id}_{i}" aria-controls="collapse_{id}_{i}">'
)
print(f"{p} &#160; <strong>{counter_publisher[p]}</strong>")
print("</button></h3>")
print(
f'<div id="collapse_{id}_{i}" class="accordion-collapse collapse" data-bs-parent="#accordion{id}">'
)
print('<div class="accordion-body">')
print('<div class="row">')
print(
'<div class="col-md-6"><strong>distribution</strong></div><div class="col-md-6"><strong>dataset</strong></div>'
)
for dist in distributions_with_problems:
entry = distributions_with_problems[dist]
if p == entry.get("publisher") and check(entry) is False:
print(
f"<div class=\"col-md-6\">{entry.get('url')}</div><div class=\"col-md-6\">{entry.get('dataset')}</div>"
)
print("</table>")
print("</div>")
print("</div></div></div>")
print("</div>")
print("<hr/>")
print("</div>")
......@@ -45,6 +71,12 @@ counter_publisher_valid = Counter()
counter_publisher_schema_valid = Counter()
counter_publisher_mimetype_mismatch = Counter()
# Per-check predicates. Each takes one parsed log entry (a dict) and returns
# the outcome of a single check. They are defined once so that the counting
# loop and the per-publisher listing in write_diagram apply the same rule.
# Callers test the result with ``is False``, so a ``None`` result (key absent)
# is not counted as a failure.


def is_valid(entry):
    """Return the entry's "valid" value, or None when the key is absent."""
    return entry.get("valid", None)


def is_accessible(entry):
    """Return the entry's "accessible" value, or None when the key is absent."""
    return entry.get("accessible", None)


def is_checksum_ok(entry):
    """Return the entry's "checksum_ok" value, or None when the key is absent."""
    return entry.get("checksum_ok", None)


def is_schema_valid(entry):
    """Return the entry's "schema_valid" value, or None when the key is absent."""
    return entry.get("schema_valid", None)


def is_mimetype_correct(entry):
    """Return False when "mimetype_mismatch" is truthy, True otherwise.

    Unlike the other predicates this always returns a bool: a missing
    "mimetype_mismatch" key is treated as "no mismatch".
    """
    return not entry.get("mimetype_mismatch", False)
distributions_with_problems = {}
with open(logfile_path, "r") as file:
......@@ -53,31 +85,31 @@ with open(logfile_path, "r") as file:
publisher = entry.get("publisher")
id = entry.get("id")
valid = entry.get("valid", None)
valid = is_valid(entry)
counter_valid[valid] += 1
if valid is False:
counter_publisher_valid[publisher] += 1
distributions_with_problems[id] = entry
accessible = entry.get("accessible", None)
accessible = is_accessible(entry)
counter_accessible[accessible] += 1
if accessible is False:
counter_publisher_accessible[publisher] += 1
distributions_with_problems[id] = entry
checksum_ok = entry.get("checksum_ok", None)
checksum_ok = is_checksum_ok(entry)
counter_checksum_ok[checksum_ok] += 1
if checksum_ok is False:
counter_publisher_checksum[publisher] += 1
distributions_with_problems[id] = entry
schema_valid = entry.get("schema_valid", None)
schema_valid = is_schema_valid(entry)
counter_schema_valid[schema_valid] += 1
if schema_valid is False:
counter_publisher_schema_valid[publisher] += 1
distributions_with_problems[id] = entry
mimetype_correct = not entry.get("mimetype_mismatch", False)
mimetype_correct = is_mimetype_correct(entry)
counter_mimetype_mismatch[mimetype_correct] += 1
if mimetype_correct is False:
counter_publisher_mimetype_mismatch[publisher] += 1
......@@ -88,6 +120,9 @@ print("<html>")
print(" <head>")
print(" <title>DCAT Catalog Check</title>")
print(' <script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>')
print(
' <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">'
)
print(
' <link rel="stylesheet" href="https://cdn.datatables.net/2.1.8/css/dataTables.dataTables.css" />'
)
......@@ -98,22 +133,68 @@ print(" <body style='background: #f2f4f7;'>")
print(" <h1>Results of the DCAT Catalog Check</h1>")
write_diagram("1", "Availability", counter_accessible, counter_publisher_accessible)
write_diagram("2", "File content", counter_valid, counter_publisher_valid)
write_diagram(
"1",
"Availability",
counter_accessible,
counter_publisher_accessible,
is_accessible,
"""
Diese Dateien können gar nicht geladen werden.
Mögliche Ursachen dafür sind: Die Datei ist gar nicht vorhanden (HTTP Fehler 404). Sie liegen hinter einem Passwortschutz (HTTP Fehler 403).
Mit dem Webserver ist gar keine Kommunikation möglich, da die Zertifikat-Konfiguration fehlerhaft ist.
""",
)
write_diagram(
"2",
"File content",
counter_valid,
counter_publisher_valid,
is_valid,
"""
Bei der technisch-inhaltlichen Prüfung ist ein Fehler gefunden worden. Ein Beispiel könnte eine JSON-Datei sein, die nicht syntaktisch korrekt ist.
Hier werden auch Dateien aufgeführt, die ein völlig anderes Format haben als angegeben. Wenn beispielsweise eine PNG-Datei versprochen wird, dann
würde eine gelieferte JPEG-Datei hier als Fehler auftauchen (da sie keine PNG-Datei ist).
""",
)
# Section 3: entries flagged with "mimetype_mismatch". The explanatory text
# (German, user-facing) says the web server returned a different MIME type
# than the one stated in the catalog. Fixed typo in that text:
# "einene" -> "einen".
write_diagram(
    "3",
    "MIME type",
    counter_mimetype_mismatch,
    counter_publisher_mimetype_mismatch,
    is_mimetype_correct,
    """
Zwar wurde die Datei nicht inhaltlich-technisch geprüft, aber der Webserver hat einen anderen MIME-Type als im Katalog angegeben zurückgeliefert.
""",
)
write_diagram("4", "Checksum", counter_checksum_ok, counter_publisher_checksum)
write_diagram(
"5", "Frictionless Schema", counter_schema_valid, counter_publisher_schema_valid
"4",
"Checksum",
counter_checksum_ok,
counter_publisher_checksum,
is_checksum_ok,
"""
Die Prüfsumme der Datei stimmt nicht mit der im Katalog angegebenen Prüfsumme überein. Das kann bedeuten, dass die Datei am Portal vorbei aktualisiert wurde.
Oder das Portal speichert die Prüfsumme falsch.
""",
)
write_diagram(
"5",
"Frictionless Schema",
counter_schema_valid,
counter_publisher_schema_valid,
is_schema_valid,
"""
Für die Datei wurde ein Frictionless Table Scheme angegeben und der Inhalt der Datei passt nicht zu diesem Schema.
""",
)
print("<div>")
print("<h2>Distributionen with errors</h2>")
print("<p>In der nachfolgenden Tabelle sind alle irgendwie fehlerhaften Distributionen zu finden. Über die Kopfzeilen kann man gezielt bestimmte Fehlerbilder oder Herausgeber filtern.</p>")
print('<table class="table" id="distributions">')
print(
"<thead><tr><th>Publisher</th><th>Format</th><th>accessible</th><th>HTTP status</th><th>content correct</th><th>MIME type wrong</th><th>MIME type</th><th>checksum correct</th><th>schema valid</th><th>URL</th><th>Error message</th></tr>"
......@@ -145,6 +226,7 @@ print("</tbody></table>")
print("</div>")
print("""
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
<script>
$(document).ready(function() {
var table = $('#distributions').DataTable();
......
0% Loading Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment