Skip to content
Snippets Groups Projects
example01_json.py 4.62 KiB
Newer Older
  • Learn to ignore specific revisions
  • root's avatar
    root committed
    import pandas as pd
    from urllib.request import urlopen, ProxyHandler, build_opener, install_opener
    import json
    
    # This is an example to extract 10 persons with the highest income of each year from a rather difficult designed json
    
    # These are proxy settings. If you are behind a proxy just comment them in and swap the ip and port with your proxy
    proxy_support = ProxyHandler({"http": "http://10.65.117.35:3128", #<proxy-ip>:<proxy-port>",
    				"https": "http://10.65.117.35:3128"}) #<proxy-ip>:<proxy-port>"}) # maybe you need to use https instead of http in the address depending on your proxy settings
    opener = build_opener(proxy_support)
    install_opener(opener)
    
    # The url to download the json
    url = "https://phpefi.schleswig-holstein.de/vo/vo_opendata/export.json"
    
    # store the response of the request
    response = urlopen(url)
    
    # converting the response into a json
    data_json = json.loads(response.read())
    
    # Now due to bad data quality and not ideal data format we adjust the json to be better convertable to a table
    data_json = data_json["Unternehmen"]
    clean_dict = []
    
    # Iterate over the json file, key is a string and unternehmen is a Json Object
    # Json Arrays are necessary for the pandas.json_normalize method to work correctly and be able to convert it to a table
    for key, unternehmen in data_json.items():
    	list_offenlegungen = []
    	# go one level deeper, jahr is a string and is the key of the key value pairs
    	# convert all "Offenlegungen" from Json Objects to Json Arrays
    	for jahr in unternehmen["Offenlegungen"]:
    		list_personen = []
    		# convert all "Personen" from Json Objects to Json Arrays
    		for person in unternehmen["Offenlegungen"][jahr]["Personen"]:
    			# The field "vm_summe_bezuege" needs to be normalized and converted from string to float
    			# First we put "0" for all entries that have an empty string ""
    			tmp = "0" if unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"] == ""\
    				 else unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"]
    			# Now we remove the dots to show thousands, replace the german ',' with the english '.' for decimal numbers, remove other things to sanitize the string,
    			# so we can convert it to a float. This step changes depending on the data. Check the errors of the float conversion to know how to further sanitize the strings.
    			tmp = float(tmp.replace(".", "").replace(",",".").replace(" ", "").replace("", "").lower().replace("euro", "").replace(".-", "").replace("1:", "").split("(")[0])
    			unternehmen["Offenlegungen"][jahr]["Personen"][person]["vm_summe_bezuege"] = tmp
    			# append every person to a list to make an json array out of it
    			list_personen.append(unternehmen["Offenlegungen"][jahr]["Personen"][person])
    		# put the before mentioned list into the json
    		unternehmen["Offenlegungen"][jahr]["Personen"] = list_personen
    		# append every Offenlegung with the new array of Personen into a list
    		list_offenlegungen.append(unternehmen["Offenlegungen"][jahr])
    	unternehmen["Offenlegungen"] = list_offenlegungen
    
    	# Remove all "Unternehmen", that do not have any "Offenlegungen"
    	if len(unternehmen["Offenlegungen"]) > 0:
    		clean_dict.append(unternehmen)
    
    # Convert Json to dataframe(a Table)
    # record path is the path to teh deepest level
    # meta is used to add Fields along the way down to the table
    df = pd.json_normalize(clean_dict,
    			record_path=["Offenlegungen", "Personen"],
    			meta=["u_name", ["Offenlegungen", "v_jahr"]])
    
    # These are all Fields of "Personen", that should be dropped out of the final table.
    # You can comment out the fields, that you want to keep.
    df = df.drop(columns=[
    	#"vm_id",
    	#"vm_veroeffentlichung_idx",
    	"vm_veroeffentlichung_zulaessig",
    	"vm_angaben_freiwillig",
    	"vm_keine_angaben",
    	#"vm_summe_bezuege",
    	#"vm_summe_zusagen",
    	"vm_anreize",
    	#"vm_bezuege_dritte",
    	"vm_reg_bezuege",
    	"vm_reg_beendigung_rueckstellung",
    	"vm_reg_beendigung_aenderung",
    	"vm_reg_beendigung_voraussetzung",
    	"vm_vorzeitig_bezuege",
    	"vm_vorzeitig_voraussetzungen",
    	"vm_verantwortlich",
    	#"vm_angezeigte_funktion"
    	])
    
    # Print for every Year the top 10 with the highest "vm_summe_bezuege" in our case the years are saved as strings
    for jahr in ["2020", "2021", "2022", "2023", "2024"]:
    	# Make a new line and print the year to have a label for each table
    	print("\n", jahr)
    	# This will put all rows with the value jahr in the column "Offenlegung.v_jahr" into one table and save the table in df_jahr
    	df_jahr = df.loc[df["Offenlegungen.v_jahr"] == jahr]
    	# sort the rows using the vm_summe_bezuege column and show them in descending order, then get the first 10 rows
    	df_jahr = df_jahr.sort_values("vm_summe_bezuege", ascending=False).take(range(10))
    	# print the table
    	print(df_jahr)