Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import matplotlib.pyplot as plt
# This example creates a histogram of the hub height (Nabenhoehe) of wind turbines from a CSV file
###################################################################################################
# WITH PROXY
#from urllib.request import urlopen, ProxyHandler, build_opener, install_opener
#import io
# These are proxy settings. If you are behind a proxy, just uncomment them and replace the IP and port with those of your proxy
#proxy_support = ProxyHandler({"http": "http://<proxy-ip>:<proxy-port>",
# "https": "http://<proxy-ip>:<proxy-port>"}) # maybe you need to use https instead of http in the address depending on your proxy settings
#opener = build_opener(proxy_support)
#install_opener(opener)
# The URL to download the CSV file from
#url = "https://opendata.schleswig-holstein.de/collection/windkraftanlagen/aktuell.csv"
# store the response of the request and unpack it
#response = urlopen(url)
#csv_byte = response.read()
# read the csv file, interpret ';' as the column separator and ',' as the decimal indicator (converted to '.' in the table)
#df = pd.read_csv(io.StringIO(csv_byte.decode("utf-8")), sep=';', decimal=',')
###################################################################################################
# WITHOUT PROXY:
# Load the table straight from the open-data portal. Columns are separated
# by ';' and numbers are written with a decimal comma, which is parsed into
# regular '.'-based numeric values in the resulting table.
df = pd.read_csv(
    filepath_or_buffer="https://opendata.schleswig-holstein.de/collection/windkraftanlagen/aktuell.csv",
    sep=";",
    decimal=",",
)
###################################################################################################
# Remove the columns that are not needed for the analysis.
# To keep a column, comment out its line below; print(df.columns) lists
# every column available in the loaded table.
df = df.drop(
    labels=[
        "KREIS",
        # "GEMEINDE",
        "TYP",
        "HERSTELLER",
        # "NABENHOEHE",
        # "ROTORDURCHMESSER",
        "SCHALLLEISTUNGSPEGEL",
        # "LEISTUNG",
        "LEISTUNGSBEZUG",
        # "OSTWERT",
        # "NORDWERT",
        # "GENEHMIGT_AM",
        "INBETRIEBNAHME",
        "STATUS",
        "BST_NR",
        "ANL_NR",
        # "AKTENZEICHEN",
        "DATENDATUM",
        "DATENQUELLE",
    ],
    axis="columns",
)
# Which column to use for the histogram
column_of_interest = "NABENHOEHE"

# Print the 20 rows with the highest values in that column.
# head() is used instead of take(range(20)) so that a table with fewer than
# 20 rows is printed in full rather than raising an IndexError.
print(df.sort_values(column_of_interest, ascending=False).head(20))

# Plot a histogram of the selected column only.
# Rows that contain NaN in that column are dropped first.
df_clean = df.dropna(subset=[column_of_interest])
df_clean[column_of_interest].hist(bins=25, rwidth=.9)

# Label the x axis with the column name and the y axis with the count
# ("Anzahl"), write the figure to histogram.png, then display it.
plt.xlabel(column_of_interest.title())
plt.ylabel("Anzahl")
plt.savefig("histogram.png")
plt.show()