Skip to content
Snippets Groups Projects
example02_csv.py 2.62 KiB
Newer Older
  • Learn to ignore specific revisions
  • root's avatar
    root committed
    import pandas as pd
    import matplotlib.pyplot as plt
    
    # This example creates a histogram of the hub height (NABENHOEHE) of wind turbines from a CSV file
    
    ###################################################################################################
    # WITH PROXY
    #from urllib.request import urlopen, ProxyHandler, build_opener, install_opener
    #import io
    
    # These are proxy settings. If you are behind a proxy, uncomment them and replace the IP and port with those of your proxy
    #proxy_support = ProxyHandler({"http": "http://<proxy-ip>:<proxy-port>",
    #                                "https": "http://<proxy-ip>:<proxy-port>"}) # maybe you need to use https instead of http in the address depending on your proxy settings
    #opener = build_opener(proxy_support)
    #install_opener(opener)
    
    # The URL to download the CSV from
    #url = "https://opendata.schleswig-holstein.de/collection/windkraftanlagen/aktuell.csv"
    
    # store the response of the request and unpack it
    #response = urlopen(url)
    #csv_byte = response.read()
    # read the CSV file, treating ';' as the column separator and ',' as the decimal mark (it becomes '.' in the table)
    #df = pd.read_csv(io.StringIO(csv_byte.decode("utf-8")), sep=';', decimal=',')
    
    ###################################################################################################
    # WITHOUT PROXY:
    # Load the CSV directly from the open-data portal. Columns are separated by
    # ';' and ',' is read as the decimal mark, so numbers use '.' in the table.
    csv_url = "https://opendata.schleswig-holstein.de/collection/windkraftanlagen/aktuell.csv"
    df = pd.read_csv(csv_url, sep=";", decimal=",")
    
    ###################################################################################################
    
    # Remove the columns that are not needed. Every active entry below is
    # dropped; comment an entry out to keep that column in the table.
    # print(df.columns) lists all available column names.
    df = df.drop(
        columns=[
            "KREIS",
            # "GEMEINDE",
            "TYP",
            "HERSTELLER",
            # "NABENHOEHE",
            # "ROTORDURCHMESSER",
            "SCHALLLEISTUNGSPEGEL",
            # "LEISTUNG",
            "LEISTUNGSBEZUG",
            # "OSTWERT",
            # "NORDWERT",
            # "GENEHMIGT_AM",
            "INBETRIEBNAHME",
            "STATUS",
            "BST_NR",
            "ANL_NR",
            # "AKTENZEICHEN",
            "DATENDATUM",
            "DATENQUELLE",
        ]
    )
    
    # Which column to use for the histogram
    column_of_interest = "NABENHOEHE"

    # Print the rows holding the 20 highest values of that column.
    # head(20) is used instead of take(range(20)) so that a table with fewer
    # than 20 rows is printed in full instead of raising an IndexError.
    print(df.sort_values(column_of_interest, ascending=False).head(20))
    
    # Plot a histogram of the selected column only.
    # Rows that have no value (NaN) in that column are filtered out first.
    df_clean = df[df[column_of_interest].notna()]
    ax = df_clean[column_of_interest].hist(bins=25, rwidth=0.9)

    # Label the x and y axes, save the figure to disk, then display it
    ax.set_xlabel(column_of_interest.title())
    ax.set_ylabel("Anzahl")
    plt.savefig("histogram.png")
    plt.show()