import pandas as pd from urllib.request import urlopen, ProxyHandler, build_opener, install_opener import json # This is an example to extract 10 persons with the highest income of each year from a rather difficult designed json # These are proxy settings. If you are behind a proxy just comment them in and swap the ip and port with your proxy proxy_support = ProxyHandler({"http": "http://10.65.117.35:3128", #<proxy-ip>:<proxy-port>", "https": "http://10.65.117.35:3128"}) #<proxy-ip>:<proxy-port>"}) # maybe you need to use https instead of http in the address depending on your proxy settings opener = build_opener(proxy_support) install_opener(opener) # The url to download the json url = "https://phpefi.schleswig-holstein.de/vo/vo_opendata/export.json" # store the response of the request response = urlopen(url) # converting the response into a json data_json = json.loads(response.read()) # Now due to bad data quality and not ideal data format we adjust the json to be better convertable to a table data_json = data_json["Unternehmen"] clean_dict = [] # Iterate over the json file, key is a string and unternehmen is a Json Object # Json Arrays are necessary for the pandas.json_normalize method to work correctly and be able to convert it to a table for key, unternehmen in data_json.items(): list_offenlegungen = [] # go one level deeper, jahr is a string and is the key of the key value pairs # convert all "Offenlegungen" from Json Objects to Json Arrays for jahr in unternehmen["Offenlegungen"]: list_personen = [] # convert all "Personen" from Json Objects to Json Arrays for person in unternehmen["Offenlegungen"][jahr]["Personen"]: # The field "vm_summe_bezuege" needs to be normalized and converted from string to float # First we put "0" for all entries that have an empty string "" tmp = "0" if unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"] == ""\ else unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"] # Now we remove the dots to show thousands, replace the german ',' with the english '.' for decimal numbers, remove other things to sanitize the string, # so we can convert it to a float. This step changes depending on the data. Check the errors of the float conversion to know how to further sanitize the strings. tmp = float(tmp.replace(".", "").replace(",",".").replace(" ", "").replace("€", "").lower().replace("euro", "").replace(".-", "").replace("1:", "").split("(")[0]) unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"] = tmp # append every person to a list to make an json array out of it list_personen.append(unternehmen["Offenlegungen"][jahr]["Personen"][person]) # put the before mentioned list into the json unternehmen["Offenlegungen"][jahr]["Personen"] = list_personen # append every Offenlegung with the new array of Personen into a list list_offenlegungen.append(unternehmen["Offenlegungen"][jahr]) unternehmen["Offenlegungen"] = list_offenlegungen # Remove all "Unternehmen", that do not have any "Offenlegungen" if len(unternehmen["Offenlegungen"]) > 0: clean_dict.append(unternehmen) # Convert Json to dataframe(a Table) # record path is the path to teh deepest level # meta is used to add Fields along the way down to the table df = pd.json_normalize(clean_dict, record_path=["Offenlegungen", "Personen"], meta=["u_name", ["Offenlegungen", "v_jahr"]]) # These are all Fields of "Personen", that should be dropped out of the final table. # You can comment out the fields, that you want to keep. df = df.drop(columns=[ #"vm_id", #"vm_veroeffentlichung_idx", "vm_veroeffentlichung_zulaessig", "vm_angaben_freiwillig", "vm_keine_angaben", #"vm_summe_bezuege", #"vm_summe_zusagen", "vm_anreize", #"vm_bezuege_dritte", "vm_reg_bezuege", "vm_reg_beendigung_rueckstellung", "vm_reg_beendigung_aenderung", "vm_reg_beendigung_voraussetzung", "vm_vorzeitig_bezuege", "vm_vorzeitig_voraussetzungen", "vm_verantwortlich", #"vm_angezeigte_funktion" ]) # Print for every Year the top 10 with the highest "vm_summe_bezuege" in our case the years are saved as strings for jahr in ["2020", "2021", "2022", "2023", "2024"]: # Make a new line and print the year to have a label for each table print("\n", jahr) # This will put all rows with the value jahr in the column "Offenlegung.v_jahr" into one table and save the table in df_jahr df_jahr = df.loc[df["Offenlegungen.v_jahr"] == jahr] # sort the rows using the vm_summe_bezuege column and show them in descending order, then get the first 10 rows df_jahr = df_jahr.sort_values("vm_summe_bezuege", ascending=False).take(range(10)) # print the table print(df_jahr)