import pandas as pd
import matplotlib.pyplot as plt

# This example builds a histogram of the hub height ("Nabenhoehe") of wind
# turbines from a CSV file.

###############################################################################
# WITH PROXY
#from urllib.request import urlopen, ProxyHandler, build_opener, install_opener
#import io

# These are proxy settings. If you are behind a proxy, uncomment the lines
# below and replace the IP and port with those of your proxy.
#proxy_support = ProxyHandler({"http": "http://<proxy-ip>:<proxy-port>",
#                              "https": "http://<proxy-ip>:<proxy-port>"})
# Depending on your proxy settings you may need "https" instead of "http" in
# the proxy addresses above.
#opener = build_opener(proxy_support)
#install_opener(opener)

# The URL to download the CSV file from
#url = "https://opendata.schleswig-holstein.de/collection/windkraftanlagen/aktuell.csv"

# Send the request and read the raw bytes of the response
#response = urlopen(url)
#csv_byte = response.read()

# Parse the CSV: ';' separates columns and ',' is the decimal mark in the file
# (pandas converts the values to regular floats).
#df = pd.read_csv(io.StringIO(csv_byte.decode("utf-8")), sep=';', decimal=',')

###############################################################################
# WITHOUT PROXY:
# Let pandas fetch and parse the CSV directly. ';' separates columns and ','
# is the decimal mark in the file (pandas converts the values to regular
# floats in the resulting table).
CSV_URL = "https://opendata.schleswig-holstein.de/collection/windkraftanlagen/aktuell.csv"
df = pd.read_csv(CSV_URL, sep=";", decimal=",")

###############################################################################
# Drop unwanted columns. Comment out the columns to keep.
# You can use print(df.columns) to see all available columns.
# Entries that are commented out below are the columns that stay in the table.
df = df.drop(columns=[
    'KREIS',
    # 'GEMEINDE',
    'TYP',
    'HERSTELLER',
    # 'NABENHOEHE',
    # 'ROTORDURCHMESSER',
    'SCHALLLEISTUNGSPEGEL',
    # 'LEISTUNG',
    'LEISTUNGSBEZUG',
    # 'OSTWERT',
    'NORDWERT',
    'GENEHMIGT_AM',
    'INBETRIEBNAHME',
    'STATUS',
    'BST_NR',
    'ANL_NR',
    # 'AKTENZEICHEN',
    'DATENDATUM',
    'DATENQUELLE'
])

# Which column to use for the histogram (it must not be dropped above)
column_of_interest = "NABENHOEHE"

# Print the 20 rows with the highest values in that column.
# head(20) is used rather than take(range(20)) so that a table with fewer
# than 20 rows prints what is available instead of raising an IndexError.
print(df.sort_values(column_of_interest, ascending=False).head(20))

# Plot a histogram of the selected column only.
# Rows that contain NaN in that column are dropped first.
df_clean = df.dropna(subset=[column_of_interest])
df_clean[column_of_interest].hist(bins=25, rwidth=.9)

# Put labels on the x and y axis ("Anzahl" is German for "count")
plt.xlabel(column_of_interest.title())
plt.ylabel("Anzahl")

# Write the figure to histogram.png, then display it
plt.savefig("histogram.png")
plt.show()