import pandas as pd
from urllib.request import urlopen, ProxyHandler, build_opener, install_opener
import json
# This example extracts the 10 persons with the highest income per year from a rather awkwardly structured JSON.
# These are proxy settings. If you are behind a proxy, uncomment them and replace <proxy-ip>:<proxy-port> with your proxy's address.
# Depending on your proxy setup you may need to use https instead of http in the address.
# proxy_support = ProxyHandler({"http": "http://<proxy-ip>:<proxy-port>",
#                               "https": "http://<proxy-ip>:<proxy-port>"})
# opener = build_opener(proxy_support)
# install_opener(opener)
# The URL to download the JSON from
url = "https://phpefi.schleswig-holstein.de/vo/vo_opendata/export.json"
# Store the response of the request
response = urlopen(url)
# Convert the response into a JSON structure (nested Python dicts and lists)
data_json = json.loads(response.read())
# Due to the poor data quality and the awkward data format, we now adjust the JSON so that it can be converted to a table more easily.
data_json = data_json["Unternehmen"]
clean_dict = []
# Iterate over the JSON: key is a string, unternehmen is a JSON object.
# JSON arrays (Python lists) are necessary for pandas.json_normalize to work correctly and convert the data to a table.
for key, unternehmen in data_json.items():
    list_offenlegungen = []
    # Go one level deeper: jahr is a string and is the key of the key-value pairs.
    # Convert all "Offenlegungen" from JSON objects to JSON arrays.
    for jahr in unternehmen["Offenlegungen"]:
        list_personen = []
        # Convert all "Personen" from JSON objects to JSON arrays.
        for person in unternehmen["Offenlegungen"][jahr]["Personen"]:
            # The field "vm_summe_bezuege" needs to be normalized and converted from string to float.
            # First we use "0" for all entries that contain an empty string "".
            tmp = "0" if unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"] == "" \
                else unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"]
            # Now we remove the dots used as thousands separators, replace the German ',' with the English '.'
            # for decimals, and strip other characters to sanitize the string so it can be converted to a float.
            # This step changes depending on the data: check the errors of the float conversion to learn how to
            # further sanitize the strings (a defensive variant with try/except is sketched right after this loop).
            tmp = float(tmp.replace(".", "").replace(",", ".").replace(" ", "").replace("€", "").lower()
                        .replace("euro", "").replace(".-", "").replace("1:", "").split("(")[0])
            unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"] = tmp
            # Append every person to a list to make a JSON array out of it.
            list_personen.append(unternehmen["Offenlegungen"][jahr]["Personen"][person])
        # Put the list built above back into the JSON.
        unternehmen["Offenlegungen"][jahr]["Personen"] = list_personen
        # Append every "Offenlegung" with its new array of "Personen" to a list.
        list_offenlegungen.append(unternehmen["Offenlegungen"][jahr])
    unternehmen["Offenlegungen"] = list_offenlegungen
    # Remove all "Unternehmen" that do not have any "Offenlegungen".
    if len(unternehmen["Offenlegungen"]) > 0:
        clean_dict.append(unternehmen)
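
# Sketch only (a hypothetical helper, not used by the rest of this script): the same
# "vm_summe_bezuege" cleaning as above, wrapped in try/except so that any raw string the
# sanitization rules cannot handle yet is reported instead of crashing the conversion.
def parse_bezuege(raw):
    cleaned = (raw.replace(".", "").replace(",", ".").replace(" ", "")
                  .replace("€", "").lower().replace("euro", "")
                  .replace(".-", "").replace("1:", "").split("(")[0])
    try:
        return float(cleaned) if cleaned else 0.0
    except ValueError:
        # Print the offending raw value so you know which sanitization rule is still missing.
        print("Could not parse vm_summe_bezuege value:", repr(raw))
        return 0.0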
# Convert the JSON to a DataFrame (a table).
# record_path is the path to the deepest level.
# meta is used to add fields from the levels along the way down to the table
# (a minimal illustration of what json_normalize does here follows the call below).
df = pd.json_normalize(clean_dict,
                       record_path=["Offenlegungen", "Personen"],
                       meta=["u_name", ["Offenlegungen", "v_jahr"]])
# These are all fields of "Personen" that should be dropped from the final table.
# You can comment out the fields that you want to keep
# (an optional check of the remaining columns follows after the drop).
df = df.drop(columns=[
    # "vm_id",
    # "vm_veroeffentlichung_idx",
    "vm_veroeffentlichung_zulaessig",
    "vm_angaben_freiwillig",
    "vm_keine_angaben",
    # "vm_summe_bezuege",
    # "vm_summe_zusagen",
    "vm_anreize",
    # "vm_bezuege_dritte",
    "vm_reg_bezuege",
    "vm_reg_beendigung_rueckstellung",
    "vm_reg_beendigung_aenderung",
    "vm_reg_beendigung_voraussetzung",
    "vm_vorzeitig_bezuege",
    "vm_vorzeitig_voraussetzungen",
    "vm_verantwortlich",
    # "vm_angezeigte_funktion"
])
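
# Optional sanity check (not required by the rest of the script): listing the remaining
# columns and their dtypes makes it easier to verify which fields survived the drop and
# that "vm_summe_bezuege" really became a float column.
print(df.columns.tolist())
print(df.dtypes)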
# For every year, print the top 10 rows with the highest "vm_summe_bezuege"; in this dataset the years are stored as strings
# (a more compact groupby alternative is sketched after this loop).
for jahr in ["2020", "2021", "2022", "2023", "2024"]:
    # Print a new line and the year as a label for each table.
    print("\n", jahr)
    # Put all rows whose "Offenlegungen.v_jahr" column equals jahr into one table and save it in df_jahr.
    df_jahr = df.loc[df["Offenlegungen.v_jahr"] == jahr]
    # Sort the rows by the "vm_summe_bezuege" column in descending order, then keep the first 10 rows
    # (head(10) also works when a year has fewer than 10 entries).
    df_jahr = df_jahr.sort_values("vm_summe_bezuege", ascending=False).head(10)
    # Print the table.
    print(df_jahr)
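
# Sketch of a more compact alternative (assuming the same column names as above):
# sort once by "vm_summe_bezuege", then take the first 10 rows of every year with
# groupby(...).head(10); this returns a single table instead of one table per year.
df_top10 = (df.sort_values("vm_summe_bezuege", ascending=False)
              .groupby("Offenlegungen.v_jahr")
              .head(10))
print(df_top10)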