Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
import html
import json
from collections import Counter
# Path of the log file that the report is built from.
logfile_path = "result.jsonl"
def write_diagram(id, title, counter, counter_publisher):
    """Print one report section: a donut chart plus a per-publisher table.

    The section is written to stdout as an HTML fragment containing an
    ApexCharts donut chart and a table listing each key of
    ``counter_publisher`` with its count.

    Args:
        id: Suffix for the chart container's element id (``vis<id>``). It is
            inserted verbatim into an HTML attribute and a JavaScript string,
            so it must be a plain identifier such as ``"1"``.
        title: Section heading; HTML-escaped before output.
        counter: Outcome tally indexed with ``True`` (correct), ``False``
            (faulty) and ``None`` (not checked). It is indexed directly, so
            a ``collections.Counter`` yields 0 for outcomes that never
            occurred, whereas a plain ``dict`` would raise ``KeyError``.
        counter_publisher: Mapping of publisher to a count, rendered one row
            per key in iteration order. Keys are HTML-escaped.
    """
    # Publisher names originate from the log data; escape them (and the
    # title) so markup characters cannot break or inject into the page.
    # ``str()`` keeps non-string keys such as ``None`` rendering as before.
    safe_title = html.escape(str(title))

    print("<div>")
    print(f"<h2>{safe_title}</h2>")
    print(f"<div id='vis{id}' style='max-width: 400px;'></div>")
    print('<script type="text/javascript">')
    print(f"new ApexCharts(document.querySelector('#vis{id}'),")
    print("{ chart: { type: 'donut' },")
    print(f"series: [{counter[True]}, {counter[False]}, {counter[None]}],")
    print("labels: ['korrekt', 'fehlerhaft', 'nicht geprüft'],")
    print('colors: ["#1eae9c", "#d4004b", "#a4adb6"]')
    print("}).render();")
    print("</script>")
    print("<h3>Publishers affected</h3>")
    print("<table>")
    for p in counter_publisher:
        safe_publisher = html.escape(str(p))
        print(f"<tr><td>{safe_publisher}</td><td>{counter_publisher[p]}</td></tr>")
    print("</table>")
    print("<hr/>")
    print("</div>")
# Outcome tallies, one per check. Keys are the raw check results found in
# the log entry: True (passed), False (failed) or None (result missing).
counter_valid = Counter()
counter_404 = Counter()  # NOTE(review): never updated in this script
counter_accessible = Counter()
# Keyed by "MIME type is correct" (True/False); a missing
# "mimetype_mismatch" field counts as correct.
counter_mimetype_mismatch = Counter()
counter_checksum_ok = Counter()
counter_schema_valid = Counter()

# Number of failed entries per publisher, one tally per check.
counter_publisher_accessible = Counter()
counter_publisher_checksum = Counter()
counter_publisher_valid = Counter()
counter_publisher_schema_valid = Counter()
counter_publisher_mimetype_mismatch = Counter()

# Log entries that failed at least one check, keyed by the entry's "id".
distributions_with_problems = {}

# (log entry key, outcome tally, per-publisher failure tally) for every
# check whose result is stored directly as True / False / missing.
_tristate_checks = (
    ("valid", counter_valid, counter_publisher_valid),
    ("accessible", counter_accessible, counter_publisher_accessible),
    ("checksum_ok", counter_checksum_ok, counter_publisher_checksum),
    ("schema_valid", counter_schema_valid, counter_publisher_schema_valid),
)

# JSON text is UTF-8; pin the encoding instead of relying on the platform
# default so non-ASCII publisher names decode identically everywhere.
with open(logfile_path, "r", encoding="utf-8") as file:
    for raw_line in file:
        record = raw_line.strip()
        if not record:
            # Tolerate blank lines instead of crashing in json.loads.
            continue
        entry = json.loads(record)
        publisher = entry.get("publisher")
        dist_id = entry.get("id")
        has_problem = False

        for key, outcomes, failures_by_publisher in _tristate_checks:
            result = entry.get(key)
            outcomes[result] += 1
            # Only an explicit False is a failure; None means "not checked".
            if result is False:
                failures_by_publisher[publisher] += 1
                has_problem = True

        # The log stores the inverse flag ("mismatch"); flip it so that this
        # tally uses True = good like the others.
        mimetype_correct = not entry.get("mimetype_mismatch", False)
        counter_mimetype_mismatch[mimetype_correct] += 1
        if mimetype_correct is False:
            counter_publisher_mimetype_mismatch[publisher] += 1
            has_problem = True

        if has_problem:
            distributions_with_problems[dist_id] = entry
# Emit the complete report as a standalone HTML page on stdout.
print("<!doctype html>")
print("<html>")
print(" <head>")
print(" <title>DCAT Catalog Check</title>")
print(' <script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>')
print(' <link rel="stylesheet" href="https://cdn.datatables.net/2.1.8/css/dataTables.dataTables.css" />')
print(' <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>')
print(' <script src="https://cdn.datatables.net/2.1.8/js/dataTables.js"></script>')
print(" </head>")
print(" <body style='background: #f2f4f7;'>")
print(" <h1>Results of the DCAT Catalog Check</h1>")

# One chart section per check.
write_diagram("1", "Availability", counter_accessible, counter_publisher_accessible)
write_diagram("2", "File content", counter_valid, counter_publisher_valid)
write_diagram(
    "3",
    "MIME type",
    counter_mimetype_mismatch,
    counter_publisher_mimetype_mismatch,
)
write_diagram("4", "Checksum", counter_checksum_ok, counter_publisher_checksum)
write_diagram(
    "5", "Frictionless Schema", counter_schema_valid, counter_publisher_schema_valid
)

# Detail table listing every entry that failed at least one check.
print("<div>")
print("<h2>Distributionen with errors</h2>")
print('<table class="table" id="distributions">')
print("<thead><tr><th>Publisher</th><th>Format</th><th>available</th><th>content correct</th><th>MIME type wrong</th><th>MIME type</th><th>checksum correct</th><th>schema valid</th><th>URL</th></tr></thead>")
print("<tbody>")

# (entry key, value shown when the key is absent), in column order matching
# the header row above. A default of None renders as the text "None".
_detail_columns = (
    ("publisher", None),
    ("format", None),
    ("http_status", ""),
    ("valid", ""),
    ("mimetype_mismatch", ""),
    ("mimetype", ""),
    ("checksum_ok", ""),
    ("schema_valid", ""),
    ("url", None),
)
for entry in distributions_with_problems.values():
    # Cell values come straight from the log file; escape them so that
    # characters such as "<" or "&" (common in URLs) cannot break the table
    # or inject markup.
    cells = "".join(
        f"<td>{html.escape(str(entry.get(key, default)))}</td>"
        for key, default in _detail_columns
    )
    print(f"<tr>{cells}</tr>")

print("</tbody></table>")
print("</div>")
print("<script>let table = new DataTable('#distributions');</script>")
print("</body></html>")