import pandas as pd
import matplotlib.pyplot as plt

# This example creates a histogram of the hub height (Nabenhoehe) of wind turbines from a CSV file

###################################################################################################
# WITH PROXY
#from urllib.request import urlopen, ProxyHandler, build_opener, install_opener
#import io

# These are proxy settings. If you are behind a proxy, uncomment them and replace the IP and port with those of your proxy
#proxy_support = ProxyHandler({"http": "http://<proxy-ip>:<proxy-port>",
#                                "https": "http://<proxy-ip>:<proxy-port>"}) # maybe you need to use https instead of http in the address depending on your proxy settings
#opener = build_opener(proxy_support)
#install_opener(opener)

# The url to download the csv
#url = "https://opendata.schleswig-holstein.de/collection/windkraftanlagen/aktuell.csv"

# store the response of the request and unpack it
#response = urlopen(url)
#csv_byte = response.read()
# read the csv file, interpret ';' as the separator of columns and ',' as a decimal indicator and convert it to '.' as decimal indicator
#df = pd.read_csv(io.StringIO(csv_byte.decode("utf-8")), sep=';', decimal=',')

###################################################################################################
# WITHOUT PROXY:
# Load the CSV directly from its URL. Columns are separated by ';' and the
# file uses ',' as the decimal mark, so numeric columns are parsed as floats.
csv_url = "https://opendata.schleswig-holstein.de/collection/windkraftanlagen/aktuell.csv"
df = pd.read_csv(csv_url, sep=';', decimal=',')

###################################################################################################

# Columns to remove from the table. Comment out an entry to keep that column.
# You can use print(df.columns) to see all available columns.
columns_to_drop = [
    'KREIS',
    # 'GEMEINDE',
    'TYP',
    'HERSTELLER',
    # 'NABENHOEHE',
    # 'ROTORDURCHMESSER',
    'SCHALLLEISTUNGSPEGEL',
    # 'LEISTUNG',
    'LEISTUNGSBEZUG',
    # 'OSTWERT',
    # 'NORDWERT',
    # 'GENEHMIGT_AM',
    'INBETRIEBNAHME',
    'STATUS',
    'BST_NR',
    'ANL_NR',
    # 'AKTENZEICHEN',
    'DATENDATUM',
    'DATENQUELLE',
]
df = df.drop(columns=columns_to_drop)

# Which column to use for the histogram
column_of_interest = "NABENHOEHE"

# Print the 20 rows with the highest values in that column.
# Sorting in descending order keeps NaN entries at the end (pandas default),
# and head(20) simply returns all rows when the table has fewer than 20.
print(df.sort_values(column_of_interest, ascending=False).head(20))

# Plot a histogram of the selected column only.
# Rows with NaN in that column are dropped first.
df_clean = df.dropna(subset=[column_of_interest])
values = df_clean[column_of_interest]
# Series.hist draws on the current figure and returns the Axes it used
ax = values.hist(bins=25, rwidth=0.9)

# Label the axes ("Anzahl" is German for "count"), then save and display the figure
ax.set_xlabel(column_of_interest.title())
ax.set_ylabel("Anzahl")
plt.savefig("histogram.png")
plt.show()