Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
DCAT Catalog Check
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Open-Data
DCAT Catalog Check
Commits
e7b06056
Commit
e7b06056
authored
3 months ago
by
Thorge Petersen
Browse files
Options
Downloads
Patches
Plain Diff
fix: keys in _clear_result() and code formatting
parent
ca271b42
No related branches found
No related tags found
1 merge request
!1
Update Formats, Dependencies, and Dockerfile Configuration
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
dcat_catalog_check.py
+36
-18
36 additions, 18 deletions
dcat_catalog_check.py
with
36 additions
and
18 deletions
dcat_catalog_check.py
+
36
−
18
View file @
e7b06056
...
...
@@ -154,7 +154,8 @@ class DcatCatalogCheck:
"
error
"
,
"
etag
"
,
"
http_status
"
,
"
last_check
"
"
mimetype
"
,
"
last_check
"
,
"
mimetype
"
,
"
mimetype_mismatch
"
,
"
valid
"
,
]:
...
...
@@ -174,7 +175,8 @@ class DcatCatalogCheck:
format
=
resource
[
"
format
"
].
lower
()
try
:
# dynamically import the corresponding module for the format
format_check_module
=
importlib
.
import_module
(
f
"
formats.
{
format
}
_format
"
)
format_check_module
=
importlib
.
import_module
(
f
"
formats.
{
format
}
_format
"
)
except
ModuleNotFoundError
:
format_check_module
=
None
...
...
@@ -212,7 +214,8 @@ class DcatCatalogCheck:
# write the content of the HTTP response into a temporary file
original_file_name
=
url
.
split
(
"
/
"
)[
-
1
]
suffix
=
original_file_name
.
split
(
"
.
"
)[
-
1
]
if
"
.
"
in
original_file_name
else
""
suffix
=
original_file_name
.
split
(
"
.
"
)[
-
1
]
if
"
.
"
in
original_file_name
else
""
with
tempfile
.
NamedTemporaryFile
(
delete
=
False
,
suffix
=
"
.
"
+
suffix
)
as
temp_file
:
...
...
@@ -235,7 +238,8 @@ class DcatCatalogCheck:
decompressor
=
decompressors
.
get
(
resource
[
"
mimetype
"
])
if
not
decompressor
:
self
.
logger
.
warning
(
f
"
Unknown compression
{
resource
[
'
mimetype
'
]
}
.
"
)
self
.
logger
.
warning
(
f
"
Unknown compression
{
resource
[
'
mimetype
'
]
}
.
"
)
else
:
with
tempfile
.
NamedTemporaryFile
(
delete
=
False
)
as
decompressed_file
:
with
decompressor
.
open
(
temp_file
.
name
,
"
rb
"
)
as
compressed_file
:
...
...
@@ -246,7 +250,8 @@ class DcatCatalogCheck:
resource
[
"
mimetype
"
]
=
self
.
_guess_mime_type
(
temp_file
.
name
)
if
self
.
_is_container
(
resource
[
"
mimetype
"
],
resource
[
"
format
"
]):
self
.
_check_container_file
(
resource
,
temp_file
,
format_check_module
)
self
.
_check_container_file
(
resource
,
temp_file
,
format_check_module
)
else
:
self
.
_check_single_file
(
resource
,
temp_file
,
format_check_module
)
...
...
@@ -274,7 +279,8 @@ class DcatCatalogCheck:
temp_file
.
write
(
file
.
read
())
temp_file
.
flush
()
resource
[
"
mimetype
"
]
=
self
.
_guess_mime_type
(
temp_file
.
name
)
resource
[
"
mimetype
"
]
=
self
.
_guess_mime_type
(
temp_file
.
name
)
validation_result
=
(
validation_result
and
self
.
_check_single_file
(
...
...
@@ -288,12 +294,14 @@ class DcatCatalogCheck:
return
contains_at_least_one_relevant_file
and
validation_result
else
:
self
.
logger
.
error
(
f
"
Unsupported container format
{
resource
[
'
mimetype
'
]
}
"
)
self
.
logger
.
error
(
f
"
Unsupported container format
{
resource
[
'
mimetype
'
]
}
"
)
def
_check_single_file
(
self
,
resource
,
temp_file
,
format_check_module
):
if
format_check_module
:
# call the function `process` that is defined in every modul
resource
[
"
valid
"
]
=
format_check_module
.
is_valid
(
resource
,
temp_file
)
resource
[
"
valid
"
]
=
format_check_module
.
is_valid
(
resource
,
temp_file
)
else
:
# There is no specialized check for the specified format.
# Does the returned MIME type match the promised format?
...
...
@@ -318,7 +326,8 @@ class DcatCatalogCheck:
):
hash_algorithm
=
hashlib
.
md5
()
else
:
print
(
f
"
WARNING: unknown checksum algorithm
{
algo_name
}
"
,
file
=
sys
.
stderr
)
print
(
f
"
WARNING: unknown checksum algorithm
{
algo_name
}
"
,
file
=
sys
.
stderr
)
return
with
open
(
temp_file
.
name
,
"
rb
"
)
as
f
:
...
...
@@ -413,7 +422,8 @@ class DcatCatalogCheck:
publisher
=
graph
.
value
(
dataset
,
DCTERMS
.
publisher
)
if
not
publisher
:
self
.
logger
.
warning
(
f
"
Publisher not found for dataset:
{
dataset
}
"
)
self
.
logger
.
warning
(
f
"
Publisher not found for dataset:
{
dataset
}
"
)
return
None
# Attempt to get the publisher's name
...
...
@@ -427,7 +437,8 @@ class DcatCatalogCheck:
except
Exception
as
e
:
# Log any unexpected errors
self
.
logger
.
error
(
f
"
Error retrieving publisher for dataset
{
dataset
}
:
{
e
}
"
)
self
.
logger
.
error
(
f
"
Error retrieving publisher for dataset
{
dataset
}
:
{
e
}
"
)
return
None
def
_process_datasets
(
self
,
datasets
,
g
):
...
...
@@ -452,7 +463,8 @@ class DcatCatalogCheck:
url
=
str
(
resource
[
"
url
"
])
if
self
.
_needs_check
(
url
):
checksum_resource
=
g
.
value
(
distribution
,
SPDX
.
checksum
)
checksum_resource
=
g
.
value
(
distribution
,
SPDX
.
checksum
)
if
checksum_resource
:
resource
[
"
checksum_algorithm
"
]
=
str
(
g
.
value
(
checksum_resource
,
SPDX
.
algorithm
)
...
...
@@ -502,7 +514,8 @@ class DcatCatalogCheck:
loaded_count
+=
1
except
json
.
JSONDecodeError
as
e
:
self
.
logger
.
error
(
f
"
Invalid JSON at line
{
line_number
}
:
{
e
}
"
)
self
.
logger
.
error
(
f
"
Invalid JSON at line
{
line_number
}
:
{
e
}
"
)
skipped_count
+=
1
self
.
logger
.
info
(
...
...
@@ -530,7 +543,8 @@ class DcatCatalogCheck:
self
.
_process_datasets
(
datasets
,
g
)
paged_collection
=
g
.
value
(
predicate
=
RDF
.
type
,
object
=
HYDRA
.
PagedCollection
)
paged_collection
=
g
.
value
(
predicate
=
RDF
.
type
,
object
=
HYDRA
.
PagedCollection
)
next_page
=
g
.
value
(
paged_collection
,
HYDRA
.
nextPage
)
url
=
str
(
next_page
)
if
next_page
else
None
...
...
@@ -555,9 +569,12 @@ if __name__ == "__main__":
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"
--url
"
,
help
=
"
DCAT catalog URL
"
)
parser
.
add_argument
(
"
--log_file
"
,
help
=
"
Log file path
"
)
parser
.
add_argument
(
"
--results
"
,
help
=
"
File from which the results are loaded
"
)
parser
.
add_argument
(
"
--verbose
"
,
action
=
"
store_true
"
,
help
=
"
Enable verbose logging
"
)
parser
.
add_argument
(
"
--debug
"
,
action
=
"
store_true
"
,
help
=
"
Enable debug logging
"
)
parser
.
add_argument
(
"
--results
"
,
help
=
"
File from which the results are loaded
"
)
parser
.
add_argument
(
"
--verbose
"
,
action
=
"
store_true
"
,
help
=
"
Enable verbose logging
"
)
parser
.
add_argument
(
"
--debug
"
,
action
=
"
store_true
"
,
help
=
"
Enable debug logging
"
)
parser
.
add_argument
(
"
--recheck
"
,
action
=
"
store_true
"
,
...
...
@@ -568,7 +585,8 @@ if __name__ == "__main__":
action
=
"
store_true
"
,
help
=
"
Just check new entries from the catalog. Do not re-check existing results.
"
,
)
parser
.
add_argument
(
"
--check-format
"
,
help
=
"
Only check the specified format
"
)
parser
.
add_argument
(
"
--check-format
"
,
help
=
"
Only check the specified format
"
)
parser
.
add_argument
(
"
--force-check-format
"
,
help
=
"
Check distributinons with the specified format regardless of previous results
"
,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment