import pandas as pd
from urllib.request import urlopen, ProxyHandler, build_opener, install_opener
import json
# This example extracts the 10 persons with the highest income per year from a rather awkwardly structured JSON.
# These are proxy settings. If you are behind a proxy, uncomment them and replace <proxy-ip>:<proxy-port> with your proxy's address.
# Depending on your proxy setup you may need to use https instead of http in the address.
# proxy_support = ProxyHandler({"http": "http://<proxy-ip>:<proxy-port>",
#                               "https": "http://<proxy-ip>:<proxy-port>"})
# opener = build_opener(proxy_support)
# install_opener(opener)
# The URL to download the JSON from
url = "https://phpefi.schleswig-holstein.de/vo/vo_opendata/export.json"
# Store the response of the request
response = urlopen(url)
# Convert the response into a JSON structure (nested Python dicts and lists)
data_json = json.loads(response.read())
# Due to the poor data quality and the awkward data format, we now adjust the JSON so that it can be converted to a table more easily.
data_json = data_json["Unternehmen"]
clean_dict = []
# Iterate over the JSON: key is a string, unternehmen is a JSON object.
# JSON arrays (Python lists) are necessary for pandas.json_normalize to work correctly and convert the data to a table.
for key, unternehmen in data_json.items():
    list_offenlegungen = []
    # Go one level deeper: jahr is a string and is the key of the key-value pairs.
    # Convert all "Offenlegungen" from JSON objects to JSON arrays.
    for jahr in unternehmen["Offenlegungen"]:
        list_personen = []
        # Convert all "Personen" from JSON objects to JSON arrays.
        for person in unternehmen["Offenlegungen"][jahr]["Personen"]:
            # The field "vm_summe_bezuege" needs to be normalized and converted from string to float.
            # First we use "0" for all entries that contain an empty string "".
            tmp = "0" if unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"] == "" \
                else unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"]
            # Now we remove the dots used as thousands separators, replace the German ',' with the English '.'
            # for decimals, and strip other characters to sanitize the string so it can be converted to a float.
            # This step changes depending on the data: check the errors of the float conversion to learn how to
            # further sanitize the strings (a defensive variant with try/except is sketched right after this loop).
            tmp = float(tmp.replace(".", "").replace(",", ".").replace(" ", "").replace("€", "").lower()
                        .replace("euro", "").replace(".-", "").replace("1:", "").split("(")[0])
            unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"] = tmp
            # Append every person to a list to make a JSON array out of it.
            list_personen.append(unternehmen["Offenlegungen"][jahr]["Personen"][person])
        # Put the list built above back into the JSON.
        unternehmen["Offenlegungen"][jahr]["Personen"] = list_personen
        # Append every "Offenlegung" with its new array of "Personen" to a list.
        list_offenlegungen.append(unternehmen["Offenlegungen"][jahr])
    unternehmen["Offenlegungen"] = list_offenlegungen
    # Remove all "Unternehmen" that do not have any "Offenlegungen".
    if len(unternehmen["Offenlegungen"]) > 0:
        clean_dict.append(unternehmen)
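
# Sketch only (a hypothetical helper, not used by the rest of this script): the same
# "vm_summe_bezuege" cleaning as above, wrapped in try/except so that any raw string the
# sanitization rules cannot handle yet is reported instead of crashing the conversion.
def parse_bezuege(raw):
    cleaned = (raw.replace(".", "").replace(",", ".").replace(" ", "")
                  .replace("€", "").lower().replace("euro", "")
                  .replace(".-", "").replace("1:", "").split("(")[0])
    try:
        return float(cleaned) if cleaned else 0.0
    except ValueError:
        # Print the offending raw value so you know which sanitization rule is still missing.
        print("Could not parse vm_summe_bezuege value:", repr(raw))
        return 0.0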
# Convert the JSON to a DataFrame (a table).
# record_path is the path to the deepest level.
# meta is used to add fields from the levels along the way down to the table
# (a minimal illustration of what json_normalize does here follows the call below).
df = pd.json_normalize(clean_dict,
                       record_path=["Offenlegungen", "Personen"],
                       meta=["u_name", ["Offenlegungen", "v_jahr"]])
# These are all fields of "Personen" that should be dropped from the final table.
# You can comment out the fields that you want to keep
# (an optional check of the remaining columns follows after the drop).
df = df.drop(columns=[
    # "vm_id",
    # "vm_veroeffentlichung_idx",
    "vm_veroeffentlichung_zulaessig",
    "vm_angaben_freiwillig",
    "vm_keine_angaben",
    # "vm_summe_bezuege",
    # "vm_summe_zusagen",
    "vm_anreize",
    # "vm_bezuege_dritte",
    "vm_reg_bezuege",
    "vm_reg_beendigung_rueckstellung",
    "vm_reg_beendigung_aenderung",
    "vm_reg_beendigung_voraussetzung",
    "vm_vorzeitig_bezuege",
    "vm_vorzeitig_voraussetzungen",
    "vm_verantwortlich",
    # "vm_angezeigte_funktion"
])
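
# Optional sanity check (not required by the rest of the script): listing the remaining
# columns and their dtypes makes it easier to verify which fields survived the drop and
# that "vm_summe_bezuege" really became a float column.
print(df.columns.tolist())
print(df.dtypes)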
# For every year, print the top 10 rows with the highest "vm_summe_bezuege"; in this dataset the years are stored as strings
# (a more compact groupby alternative is sketched after this loop).
for jahr in ["2020", "2021", "2022", "2023", "2024"]:
    # Print a new line and the year as a label for each table.
    print("\n", jahr)
    # Put all rows whose "Offenlegungen.v_jahr" column equals jahr into one table and save it in df_jahr.
    df_jahr = df.loc[df["Offenlegungen.v_jahr"] == jahr]
    # Sort the rows by the "vm_summe_bezuege" column in descending order, then keep the first 10 rows
    # (head(10) also works when a year has fewer than 10 entries).
    df_jahr = df_jahr.sort_values("vm_summe_bezuege", ascending=False).head(10)
    # Print the table.
    print(df_jahr)
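
# Sketch of a more compact alternative (assuming the same column names as above):
# sort once by "vm_summe_bezuege", then take the first 10 rows of every year with
# groupby(...).head(10); this returns a single table instead of one table per year.
df_top10 = (df.sort_values("vm_summe_bezuege", ascending=False)
              .groupby("Offenlegungen.v_jahr")
              .head(10))
print(df_top10)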