Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
DCAT Catalog Check
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Open-Data
DCAT Catalog Check
Commits
152647cb
Verified
Commit
152647cb
authored
6 months ago
by
Jesper Zedlitz
Browse files
Options
Downloads
Patches
Plain Diff
store size of the distribution
ruff format
parent
b072c066
Branches
Branches containing commit
Tags
Tags containing commit
1 merge request
!1
Update Formats, Dependencies, and Dockerfile Configuration
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
dcat_catalog_check.py
+26
-37
26 additions, 37 deletions
dcat_catalog_check.py
with
26 additions
and
37 deletions
dcat_catalog_check.py
+
26
−
37
View file @
152647cb
...
@@ -174,8 +174,7 @@ class DcatCatalogCheck:
...
@@ -174,8 +174,7 @@ class DcatCatalogCheck:
format
=
resource
[
"
format
"
].
lower
()
format
=
resource
[
"
format
"
].
lower
()
try
:
try
:
# dynamically import the corresponding module for the format
# dynamically import the corresponding module for the format
format_check_module
=
importlib
.
import_module
(
format_check_module
=
importlib
.
import_module
(
f
"
formats.
{
format
}
_format
"
)
f
"
formats.
{
format
}
_format
"
)
except
ModuleNotFoundError
:
except
ModuleNotFoundError
:
format_check_module
=
None
format_check_module
=
None
...
@@ -194,6 +193,9 @@ class DcatCatalogCheck:
...
@@ -194,6 +193,9 @@ class DcatCatalogCheck:
if
"
etag
"
in
response
.
headers
:
if
"
etag
"
in
response
.
headers
:
resource
[
"
etag
"
]
=
response
.
headers
[
"
etag
"
]
resource
[
"
etag
"
]
=
response
.
headers
[
"
etag
"
]
if
"
content-length
"
in
response
.
headers
:
resource
[
"
size
"
]
=
response
.
headers
[
"
content-length
"
]
except
requests
.
exceptions
.
RequestException
as
err
:
except
requests
.
exceptions
.
RequestException
as
err
:
# Handle connection, timeout, or other request errors
# Handle connection, timeout, or other request errors
resource
[
"
accessible
"
]
=
False
resource
[
"
accessible
"
]
=
False
...
@@ -210,8 +212,7 @@ class DcatCatalogCheck:
...
@@ -210,8 +212,7 @@ class DcatCatalogCheck:
# write the content of the HTTP response into a temporary file
# write the content of the HTTP response into a temporary file
original_file_name
=
url
.
split
(
"
/
"
)[
-
1
]
original_file_name
=
url
.
split
(
"
/
"
)[
-
1
]
suffix
=
original_file_name
.
split
(
suffix
=
original_file_name
.
split
(
"
.
"
)[
-
1
]
if
"
.
"
in
original_file_name
else
""
"
.
"
)[
-
1
]
if
"
.
"
in
original_file_name
else
""
with
tempfile
.
NamedTemporaryFile
(
with
tempfile
.
NamedTemporaryFile
(
delete
=
False
,
suffix
=
"
.
"
+
suffix
delete
=
False
,
suffix
=
"
.
"
+
suffix
)
as
temp_file
:
)
as
temp_file
:
...
@@ -234,8 +235,7 @@ class DcatCatalogCheck:
...
@@ -234,8 +235,7 @@ class DcatCatalogCheck:
decompressor
=
decompressors
.
get
(
resource
[
"
mimetype
"
])
decompressor
=
decompressors
.
get
(
resource
[
"
mimetype
"
])
if
not
decompressor
:
if
not
decompressor
:
self
.
logger
.
warning
(
self
.
logger
.
warning
(
f
"
Unknown compression
{
resource
[
'
mimetype
'
]
}
.
"
)
f
"
Unknown compression
{
resource
[
'
mimetype
'
]
}
.
"
)
else
:
else
:
with
tempfile
.
NamedTemporaryFile
(
delete
=
False
)
as
decompressed_file
:
with
tempfile
.
NamedTemporaryFile
(
delete
=
False
)
as
decompressed_file
:
with
decompressor
.
open
(
temp_file
.
name
,
"
rb
"
)
as
compressed_file
:
with
decompressor
.
open
(
temp_file
.
name
,
"
rb
"
)
as
compressed_file
:
...
@@ -246,8 +246,7 @@ class DcatCatalogCheck:
...
@@ -246,8 +246,7 @@ class DcatCatalogCheck:
resource
[
"
mimetype
"
]
=
self
.
_guess_mime_type
(
temp_file
.
name
)
resource
[
"
mimetype
"
]
=
self
.
_guess_mime_type
(
temp_file
.
name
)
if
self
.
_is_container
(
resource
[
"
mimetype
"
],
resource
[
"
format
"
]):
if
self
.
_is_container
(
resource
[
"
mimetype
"
],
resource
[
"
format
"
]):
self
.
_check_container_file
(
self
.
_check_container_file
(
resource
,
temp_file
,
format_check_module
)
resource
,
temp_file
,
format_check_module
)
else
:
else
:
self
.
_check_single_file
(
resource
,
temp_file
,
format_check_module
)
self
.
_check_single_file
(
resource
,
temp_file
,
format_check_module
)
...
@@ -275,8 +274,7 @@ class DcatCatalogCheck:
...
@@ -275,8 +274,7 @@ class DcatCatalogCheck:
temp_file
.
write
(
file
.
read
())
temp_file
.
write
(
file
.
read
())
temp_file
.
flush
()
temp_file
.
flush
()
resource
[
"
mimetype
"
]
=
self
.
_guess_mime_type
(
resource
[
"
mimetype
"
]
=
self
.
_guess_mime_type
(
temp_file
.
name
)
temp_file
.
name
)
validation_result
=
(
validation_result
=
(
validation_result
validation_result
and
self
.
_check_single_file
(
and
self
.
_check_single_file
(
...
@@ -290,14 +288,12 @@ class DcatCatalogCheck:
...
@@ -290,14 +288,12 @@ class DcatCatalogCheck:
return
contains_at_least_one_relevant_file
and
validation_result
return
contains_at_least_one_relevant_file
and
validation_result
else
:
else
:
self
.
logger
.
error
(
self
.
logger
.
error
(
f
"
Unsupported container format
{
resource
[
'
mimetype
'
]
}
"
)
f
"
Unsupported container format
{
resource
[
'
mimetype
'
]
}
"
)
def
_check_single_file
(
self
,
resource
,
temp_file
,
format_check_module
):
def
_check_single_file
(
self
,
resource
,
temp_file
,
format_check_module
):
if
format_check_module
:
if
format_check_module
:
# call the function `process` that is defined in every modul
# call the function `process` that is defined in every modul
resource
[
"
valid
"
]
=
format_check_module
.
is_valid
(
resource
[
"
valid
"
]
=
format_check_module
.
is_valid
(
resource
,
temp_file
)
resource
,
temp_file
)
else
:
else
:
# There is no specialized check for the specified format.
# There is no specialized check for the specified format.
# Does the returned MIME type match the promised format?
# Does the returned MIME type match the promised format?
...
@@ -322,8 +318,7 @@ class DcatCatalogCheck:
...
@@ -322,8 +318,7 @@ class DcatCatalogCheck:
):
):
hash_algorithm
=
hashlib
.
md5
()
hash_algorithm
=
hashlib
.
md5
()
else
:
else
:
print
(
print
(
f
"
WARNING: unknown checksum algorithm
{
algo_name
}
"
,
file
=
sys
.
stderr
)
f
"
WARNING: unknown checksum algorithm
{
algo_name
}
"
,
file
=
sys
.
stderr
)
return
return
with
open
(
temp_file
.
name
,
"
rb
"
)
as
f
:
with
open
(
temp_file
.
name
,
"
rb
"
)
as
f
:
...
@@ -418,8 +413,7 @@ class DcatCatalogCheck:
...
@@ -418,8 +413,7 @@ class DcatCatalogCheck:
publisher
=
graph
.
value
(
dataset
,
DCTERMS
.
publisher
)
publisher
=
graph
.
value
(
dataset
,
DCTERMS
.
publisher
)
if
not
publisher
:
if
not
publisher
:
self
.
logger
.
warning
(
self
.
logger
.
warning
(
f
"
Publisher not found for dataset:
{
dataset
}
"
)
f
"
Publisher not found for dataset:
{
dataset
}
"
)
return
None
return
None
# Attempt to get the publisher's name
# Attempt to get the publisher's name
...
@@ -433,8 +427,7 @@ class DcatCatalogCheck:
...
@@ -433,8 +427,7 @@ class DcatCatalogCheck:
except
Exception
as
e
:
except
Exception
as
e
:
# Log any unexpected errors
# Log any unexpected errors
self
.
logger
.
error
(
self
.
logger
.
error
(
f
"
Error retrieving publisher for dataset
{
dataset
}
:
{
e
}
"
)
f
"
Error retrieving publisher for dataset
{
dataset
}
:
{
e
}
"
)
return
None
return
None
def
_process_datasets
(
self
,
datasets
,
g
):
def
_process_datasets
(
self
,
datasets
,
g
):
...
@@ -459,8 +452,7 @@ class DcatCatalogCheck:
...
@@ -459,8 +452,7 @@ class DcatCatalogCheck:
url
=
str
(
resource
[
"
url
"
])
url
=
str
(
resource
[
"
url
"
])
if
self
.
_needs_check
(
url
):
if
self
.
_needs_check
(
url
):
checksum_resource
=
g
.
value
(
checksum_resource
=
g
.
value
(
distribution
,
SPDX
.
checksum
)
distribution
,
SPDX
.
checksum
)
if
checksum_resource
:
if
checksum_resource
:
resource
[
"
checksum_algorithm
"
]
=
str
(
resource
[
"
checksum_algorithm
"
]
=
str
(
g
.
value
(
checksum_resource
,
SPDX
.
algorithm
)
g
.
value
(
checksum_resource
,
SPDX
.
algorithm
)
...
@@ -481,7 +473,8 @@ class DcatCatalogCheck:
...
@@ -481,7 +473,8 @@ class DcatCatalogCheck:
def
read_previous_results
(
self
,
file_path
):
def
read_previous_results
(
self
,
file_path
):
if
not
os
.
path
.
exists
(
file_path
):
if
not
os
.
path
.
exists
(
file_path
):
self
.
logger
.
warning
(
self
.
logger
.
warning
(
f
"
File
'
{
file_path
}
'
does not exist. No previous results loaded.
"
)
f
"
File
'
{
file_path
}
'
does not exist. No previous results loaded.
"
)
return
return
loaded_count
=
0
loaded_count
=
0
...
@@ -500,7 +493,8 @@ class DcatCatalogCheck:
...
@@ -500,7 +493,8 @@ class DcatCatalogCheck:
url
=
json_object
.
get
(
"
url
"
)
url
=
json_object
.
get
(
"
url
"
)
if
not
url
:
if
not
url
:
self
.
logger
.
warning
(
self
.
logger
.
warning
(
f
"
Line
{
line_number
}
is missing
'
url
'
:
{
line
}
"
)
f
"
Line
{
line_number
}
is missing
'
url
'
:
{
line
}
"
)
skipped_count
+=
1
skipped_count
+=
1
continue
continue
...
@@ -508,12 +502,12 @@ class DcatCatalogCheck:
...
@@ -508,12 +502,12 @@ class DcatCatalogCheck:
loaded_count
+=
1
loaded_count
+=
1
except
json
.
JSONDecodeError
as
e
:
except
json
.
JSONDecodeError
as
e
:
self
.
logger
.
error
(
self
.
logger
.
error
(
f
"
Invalid JSON at line
{
line_number
}
:
{
e
}
"
)
f
"
Invalid JSON at line
{
line_number
}
:
{
e
}
"
)
skipped_count
+=
1
skipped_count
+=
1
self
.
logger
.
info
(
self
.
logger
.
info
(
f
"
Loaded
{
loaded_count
}
results from
'
{
file_path
}
'
, skipped
{
skipped_count
}
lines.
"
)
f
"
Loaded
{
loaded_count
}
results from
'
{
file_path
}
'
, skipped
{
skipped_count
}
lines.
"
)
def
read_dcat_catalog
(
self
,
url
):
def
read_dcat_catalog
(
self
,
url
):
while
url
:
while
url
:
...
@@ -536,8 +530,7 @@ class DcatCatalogCheck:
...
@@ -536,8 +530,7 @@ class DcatCatalogCheck:
self
.
_process_datasets
(
datasets
,
g
)
self
.
_process_datasets
(
datasets
,
g
)
paged_collection
=
g
.
value
(
paged_collection
=
g
.
value
(
predicate
=
RDF
.
type
,
object
=
HYDRA
.
PagedCollection
)
predicate
=
RDF
.
type
,
object
=
HYDRA
.
PagedCollection
)
next_page
=
g
.
value
(
paged_collection
,
HYDRA
.
nextPage
)
next_page
=
g
.
value
(
paged_collection
,
HYDRA
.
nextPage
)
url
=
str
(
next_page
)
if
next_page
else
None
url
=
str
(
next_page
)
if
next_page
else
None
...
@@ -562,12 +555,9 @@ if __name__ == "__main__":
...
@@ -562,12 +555,9 @@ if __name__ == "__main__":
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"
--url
"
,
help
=
"
DCAT catalog URL
"
)
parser
.
add_argument
(
"
--url
"
,
help
=
"
DCAT catalog URL
"
)
parser
.
add_argument
(
"
--log_file
"
,
help
=
"
Log file path
"
)
parser
.
add_argument
(
"
--log_file
"
,
help
=
"
Log file path
"
)
parser
.
add_argument
(
parser
.
add_argument
(
"
--results
"
,
help
=
"
File from which the results are loaded
"
)
"
--results
"
,
help
=
"
File from which the results are loaded
"
)
parser
.
add_argument
(
"
--verbose
"
,
action
=
"
store_true
"
,
help
=
"
Enable verbose logging
"
)
parser
.
add_argument
(
"
--verbose
"
,
action
=
"
store_true
"
,
parser
.
add_argument
(
"
--debug
"
,
action
=
"
store_true
"
,
help
=
"
Enable debug logging
"
)
help
=
"
Enable verbose logging
"
)
parser
.
add_argument
(
"
--debug
"
,
action
=
"
store_true
"
,
help
=
"
Enable debug logging
"
)
parser
.
add_argument
(
parser
.
add_argument
(
"
--recheck
"
,
"
--recheck
"
,
action
=
"
store_true
"
,
action
=
"
store_true
"
,
...
@@ -578,8 +568,7 @@ if __name__ == "__main__":
...
@@ -578,8 +568,7 @@ if __name__ == "__main__":
action
=
"
store_true
"
,
action
=
"
store_true
"
,
help
=
"
Just check new entries from the catalog. Do not re-check existing results.
"
,
help
=
"
Just check new entries from the catalog. Do not re-check existing results.
"
,
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"
--check-format
"
,
help
=
"
Only check the specified format
"
)
"
--check-format
"
,
help
=
"
Only check the specified format
"
)
parser
.
add_argument
(
parser
.
add_argument
(
"
--force-check-format
"
,
"
--force-check-format
"
,
help
=
"
Check distributinons with the specified format regardless of previous results
"
,
help
=
"
Check distributinons with the specified format regardless of previous results
"
,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment