diff --git a/docker/Dockerfile b/docker/Dockerfile
index f7abd485039336560f6acfa14265339dce7d3584..645737d087b1f7318b2a800639d1a143f359f645 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,15 +1,15 @@
 # syntax = docker/dockerfile:1.3
-FROM nvidia/cuda:11.6.2-base-ubuntu20.04
+FROM pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime
 RUN apt update && apt install -y \
-    pip \
+    git \
     && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt /tmp/requirements.txt
+COPY requirements.txt /tmp/requirements.txt
 RUN pip install --upgrade pip && \
     pip install --no-cache-dir -r /tmp/requirements.txt
-WORKDIR /home/wavo-torch/notebooks
-CMD jupyter-lab --no-browser --ip 0.0.0.0 --port 8888 --allow-root
-#--notebook-dir=/tf 
-LABEL name="kiwavo-tk" version="0.0.4" maintainer="Michel Spils <msp@informatik.uni-kiel.de>"
-# docker run -u $(id -u):$(id -g) --gpus all --rm -it -p 9999:9999 --name wln --workdir /../app/ki-wavo/notebooks 
-# --mount type=bind,source="$(pwd)",target=/app  kiwavo/notebook:latest jupyter-lab --ip 0.0.0.0 --port 9999
-#pytorch-forecasting==1.0.0
+ARG USERNAME=mspils
+ARG USER_UID=5006
+ARG USER_GID=$USER_UID
+# Create the user
+RUN groupadd --gid $USER_GID $USERNAME \
+    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME
+LABEL name="wavo-dev" version="0.0.2" maintainer="Michel Spils <msp@informatik.uni-kiel.de>"
\ No newline at end of file
diff --git a/docker/Dockerfile_devcontainer b/docker/Dockerfile_devcontainer
deleted file mode 100644
index df457b74297ed213207cf3db05efe025fa366713..0000000000000000000000000000000000000000
--- a/docker/Dockerfile_devcontainer
+++ /dev/null
@@ -1,15 +0,0 @@
-# syntax = docker/dockerfile:1.3
-FROM pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime
-RUN apt update && apt install -y \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt /tmp/requirements.txt
-RUN pip install --upgrade pip && \
-    pip install --no-cache-dir -r /tmp/requirements.txt
-ARG USERNAME=mspils
-ARG USER_UID=5006
-ARG USER_GID=$USER_UID
-# Create the user
-RUN groupadd --gid $USER_GID $USERNAME \
-    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME
-LABEL name="wavo-dev" version="0.0.2" maintainer="Michel Spils <msp@informatik.uni-kiel.de>"
\ No newline at end of file
diff --git a/docker/Dockerfile_old b/docker/Dockerfile_old
new file mode 100644
index 0000000000000000000000000000000000000000..f7abd485039336560f6acfa14265339dce7d3584
--- /dev/null
+++ b/docker/Dockerfile_old
@@ -0,0 +1,15 @@
+# syntax = docker/dockerfile:1.3
+FROM nvidia/cuda:11.6.2-base-ubuntu20.04
+RUN apt update && apt install -y \
+    pip \
+    && rm -rf /var/lib/apt/lists/*
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install --upgrade pip && \
+    pip install --no-cache-dir -r /tmp/requirements.txt
+WORKDIR /home/wavo-torch/notebooks
+CMD jupyter-lab --no-browser --ip 0.0.0.0 --port 8888 --allow-root
+#--notebook-dir=/tf 
+LABEL name="kiwavo-tk" version="0.0.4" maintainer="Michel Spils <msp@informatik.uni-kiel.de>"
+# docker run -u $(id -u):$(id -g) --gpus all --rm -it -p 9999:9999 --name wln --workdir /../app/ki-wavo/notebooks 
+# --mount type=bind,source="$(pwd)",target=/app  kiwavo/notebook:latest jupyter-lab --ip 0.0.0.0 --port 9999
+#pytorch-forecasting==1.0.0
diff --git a/notebooks/scraping.ipynb b/notebooks/scraping.ipynb
index cdc55f1c38bd128f28d085047336b27af766a273..01b1858fea8bac1ba899f71f743daaa93c3ec830 100644
--- a/notebooks/scraping.ipynb
+++ b/notebooks/scraping.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 113,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,12 +11,20 @@
     "from pathlib import Path\n",
     "import pandas as pd\n",
     "from tqdm import tqdm\n",
-    "import matplotlib.pyplot as plt"
+    "import matplotlib.pyplot as plt\n",
+    "import requests\n",
+    "import os\n",
+    "from bs4 import BeautifulSoup\n",
+    "from urllib.parse import urljoin\n",
+    "import zipfile\n",
+    "import io\n",
+    "import pandas as pd\n",
+    "from datetime import datetime, timedelta\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": []
@@ -33,17 +41,206 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 112,
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "dwd_base_url = \"https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/\"\n",
-    "sensor_types = [\"air_temperature\",\"dew_point\",\"moisture\",\"precipitation\",\"pressure\",\"sun\",\"wind\"]\n"
+    "\n",
+    "def read_and_filter_stations(url):\n",
+    "    \"\"\"Load the station list (first .txt link at `url`) and keep stations whose record spans the date five years ago.\"\"\"\n",
+    "    response = requests.get(url)\n",
+    "    response.raise_for_status()  # do not parse an HTTP error page as a directory listing\n",
+    "    soup = BeautifulSoup(response.content, 'html.parser')\n",
+    "    txt_links = [link.get('href') for link in soup.find_all('a') if link.get('href', '').endswith('.txt')]\n",
+    "    if not txt_links:\n",
+    "        raise ValueError(f\"No .txt file linked from {url}\")\n",
+    "    f = requests.get(urljoin(url, txt_links[0]))\n",
+    "    f.raise_for_status()\n",
+    "    df = pd.read_fwf(io.StringIO(f.text), colspecs=[(0, 5), (6, 14), (15, 29), (30, 40), (42, 52), (53, 60), (61, 102), (102, 128), (129, 148)],\n",
+    "                        names=['Stations_id', 'von_datum', 'bis_datum', 'Stationshoehe', 'geoBreite', 'geoLaenge', 'Stationsname', 'Bundesland', 'Abgabe'],\n",
+    "                        skiprows=2,\n",
+    "                        dtype={0: \"str\"})\n",
+    "    # Convert date columns to datetime\n",
+    "    df['von_datum'] = pd.to_datetime(df['von_datum'], format='%Y%m%d')\n",
+    "    df['bis_datum'] = pd.to_datetime(df['bis_datum'], format='%Y%m%d')\n",
+    "\n",
+    "    # Calculate the date 5 years ago from today\n",
+    "    five_years_ago = datetime.now() - timedelta(days=5*365)\n",
+    "    # Keep stations whose record starts before that date and ends after it\n",
+    "    df = df[(df['von_datum'] < five_years_ago) & (df['bis_datum'] > five_years_ago)]\n",
+    "    return df\n",
+    "\n",
+    "def download_and_extract_data(url, output_dir, stations_df):\n",
+    "    \"\"\"Download each station zip linked at `url` whose id is in `stations_df` and hand its 'produkt_' file to process_csv.\"\"\"\n",
+    "    # parents=True so a nested output_dir whose parent does not exist yet is created too\n",
+    "    output_dir.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "    # Fetch the webpage content\n",
+    "    response = requests.get(url)\n",
+    "    response.raise_for_status()\n",
+    "    soup = BeautifulSoup(response.content, 'html.parser')\n",
+    "    # Find all links to zip files\n",
+    "    zip_links = [link.get('href') for link in soup.find_all('a') if link.get('href', '').endswith('.zip')]\n",
+    "    wanted_ids = set(stations_df['Stations_id'])  # built once instead of scanning the column per link\n",
+    "    for zip_link in  tqdm(zip_links):\n",
+    "        # Extract station ID from the zip file name\n",
+    "        station_id = zip_link.split('_')[2]#.split('.')[0]\n",
+    "        if station_id in wanted_ids:\n",
+    "            full_url = urljoin(url, zip_link)\n",
+    "            # Download the zip file\n",
+    "            zip_response = requests.get(full_url)\n",
+    "            zip_response.raise_for_status()\n",
+    "            # Extract the zip file in memory\n",
+    "            with zipfile.ZipFile(io.BytesIO(zip_response.content)) as zip_ref:\n",
+    "                # Find the file starting with 'produkt_'\n",
+    "                produkt_file = next((name for name in zip_ref.namelist() if name.startswith('produkt_')), None)\n",
+    "                if produkt_file:\n",
+    "                    # Extract, process and save the 'produkt_' file\n",
+    "                    with zip_ref.open(produkt_file) as source:\n",
+    "                        process_csv(source,url,output_dir,station_id)\n",
+    "                    print(f\"Extracted: {produkt_file}\")\n",
+    "                    break #TODO remove, this is for testing\n",
+    "                else:\n",
+    "                    print(f\"No 'produkt_' file found in {zip_link}\")\n",
+    "        else:\n",
+    "            print(f\"Skipping {zip_link} as it's not in the filtered station list\")\n",
+    "\n",
+    "def process_csv(source,url,output_dir,station_id):\n",
+    "    \"\"\"Parse one 'produkt_' file, write one CSV per feature under output_dir and return the parsed frame.\"\"\"\n",
+    "    sensor = url.split(\"/\")[-3]  # url ends in <sensor>/historical/, so [-3] is the sensor directory\n",
+    "    if sensor != \"air_temperature\":\n",
+    "        # Fail with a clear message instead of an UnboundLocalError on `usecols` below\n",
+    "        raise NotImplementedError(f\"No column mapping for sensor type {sensor!r} yet\")\n",
+    "    #TODO branches for the other directories. Some contain more than one feature. air_temperature for example has the air_temperature AND the air humidity\n",
+    "    usecols = [1,3,4]\n",
+    "    df = pd.read_csv(source, delimiter=';', encoding='utf-8',usecols=usecols,index_col=0,parse_dates=True,date_format=\"%Y%m%d%H\")\n",
+    "\n",
+    "    #TODO i would suggest filtering the data to values in the last 20 years at most here\n",
+    "\n",
+    "    for feature, column in ((\"air_temperature\", \"TT_TU\"), (\"air_humidity\", \"RF_TU\")):\n",
+    "        (output_dir / feature).mkdir(exist_ok=True)\n",
+    "        df[[column]].to_csv(output_dir / feature / f\"{station_id}.csv\")\n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "# Usage\n",
+    "output_dir = Path(\"data/climate_data\")\n",
+    "\n",
+    "# Read and filter stations\n",
+    "#TODO choose which ones you want.\n",
+    "potential_sensors = [\"air_temperature\",\"cloud_type\",\"cloudiness\",\"dew_point\",\"global_radiation\",\"precipitation\",\"pressure\",\"soil_temperature\",\"solar\",\"sun\",\"visibility\",\"wind\"]\n",
+    "\n",
+    "for dir_part in potential_sensors:\n",
+    "    url = \"https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/\"+dir_part+\"/historical/\"\n",
+    "    print(f\"Processing {dir_part}\")\n",
+    "\n",
+    "    df_filtered_stations = read_and_filter_stations(url)\n",
+    "    #print(f\"Number of stations with data in the last 5 years: {len(df_filtered_stations)}\")\n",
+    "    download_and_extract_data(url, output_dir, df_filtered_stations)\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "output_dir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with zipfile.ZipFile(io.BytesIO(zip_response.content)) as zip_ref:\n",
+    "    with zip_ref.open(produkt_file) as source:\n",
+    "        df2 = process_csv(source,url,output_dir,station_id)\n",
+    "df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Download and process data for filtered stations\n",
+    "download_and_extract_data(url, output_dir, filtered_stations)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.to_datetime(df['von_datum'], format='%Y%m%d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_filtered"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dwd_base_url = \"https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/\"\n",
+    "sensor_types = [\"air_temperature\",\"dew_point\",\"moisture\",\"precipitation\",\"pressure\",\"sun\",\"wind\"]\n",
+    "\n",
     "start_list = []\n",
     "\n",
     "for sensor_type in sensor_types:\n",
@@ -100,6 +297,13 @@
     "fig.savefig(\"dwd_data_availability.png\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -559,6 +763,440 @@
     "Counter([x[\"DataUnit\"] for station in stations_by_region for x in station[\"tslist\"]])\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_135949/3569064465.py:1: DeprecationWarning: \n",
+      "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
+      "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
+      "but was not found to be installed on your system.\n",
+      "If this would cause problems for you,\n",
+      "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
+      "        \n",
+      "  import pandas as pd\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from os.path import dirname, abspath\n",
+    "import statistics\n",
+    "from torch.utils.data import Dataset\n",
+    "from sklearn.manifold import TSNE\n",
+    "from sklearn.decomposition import PCA\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import plotly.graph_objects as go\n",
+    "from plotly.subplots import make_subplots\n",
+    "\n",
+    "def fill_missing_values(df: pd.DataFrame, max_fill=10) -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Fills values in a DataFrame.\n",
+    "    First columns recognized as Precipitation columns (containing NEW or NVh) are filled with 0,\n",
+    "    other columns are interpolated linearly across at most max_fill consecutive missing values; a ValueError is raised if gaps remain\n",
+    "\n",
+    "    Args:\n",
+    "        df : DataFrame that might be missing values\n",
+    "        max_fill : How many consecutive missing values may be interpolated\n",
+    "    Returns:\n",
+    "        DataFrame: Filled DataFrame (if possible)\n",
+    "    \"\"\"\n",
+    "    old_size = df.shape[0]\n",
+    "\n",
+    "    df = df.resample(\"h\").mean()  # TODO maybe not 'h' for other datasets?\n",
+    "    na_count = df.isna().sum(axis=0)\n",
+    "\n",
+    "    # get all columns with precipitation and fill missing values with 0\n",
+    "    mask = df.columns.str.contains(\"NEW\") | df.columns.str.contains(\"NVh\")\n",
+    "\n",
+    "    prec_cols = list(na_count[mask][na_count[mask] > 0].index)\n",
+    "    if len(prec_cols) > 0:\n",
+    "        df.loc[:, mask] = df.loc[:, mask].fillna(0)\n",
+    "\n",
+    "    # interpolate data in all other columns\n",
+    "    df = df.interpolate(limit=max_fill, limit_direction=\"both\")\n",
+    "\n",
+    "    if df.isna().sum().sum() > 0:\n",
+    "        raise ValueError(\n",
+    "            f\"Some columns were missing more than {max_fill} continuous values, either raise the limit or fill values manually.\"\n",
+    "            + f\"{df.isna().sum().sum()} still missing, maybe due to {len(df)-old_size} missing timestamps?\"\n",
+    "        )\n",
+    "    return df\n",
+    "\n",
+    "class TimeSeriesDataSetGAN(Dataset):\n",
+    "    \"\"\"\n",
+    "    Custom trivial Dataset, just takes a rolling window of X with length seq_len\n",
+    "\n",
+    "    Args:\n",
+    "        X (DataFrame): Input values; each item is the `.values`\n",
+    "            array of the row slice X[idx : idx + seq_len]\n",
+    "        seq_len (int): window length in rows; the dataset\n",
+    "            exposes len(X) - seq_len windows\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, X, seq_len) -> None:\n",
+    "        self.X = X\n",
+    "        self.seq_len = seq_len\n",
+    "\n",
+    "    def __len__(self) -> int:\n",
+    "        return len(self.X) - self.seq_len\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        if idx >= len(self):\n",
+    "            raise IndexError(\n",
+    "                f\"Index {idx} is out of range, dataset has length {len(self)}\"\n",
+    "            )\n",
+    "        return self.X[idx : idx + self.seq_len].values\n",
+    "\n",
+    "def plot_tsne_plotly(real_tsne_results: np.ndarray, fake_tsne_results: np.ndarray, window_size: int):\n",
+    "    \"\"\"Plot t-SNE results using Plotly.\"\"\"\n",
+    "    fig = go.Figure()\n",
+    "\n",
+    "    # Add t-SNE scatter plot\n",
+    "    fig.add_trace(\n",
+    "        go.Scatter(\n",
+    "            x=real_tsne_results[:, 0],\n",
+    "            y=real_tsne_results[:, 1],\n",
+    "            mode='markers',\n",
+    "            marker=dict(\n",
+    "                size=5,\n",
+    "                color=\"blue\"\n",
+    "            ),\n",
+    "            name='Real Data'\n",
+    "        )\n",
+    "    )\n",
+    "    fig.add_trace(\n",
+    "        go.Scatter(\n",
+    "            x=fake_tsne_results[:, 0],\n",
+    "            y=fake_tsne_results[:, 1],\n",
+    "            mode='markers',\n",
+    "            marker=dict(\n",
+    "                size=5,\n",
+    "                color=\"red\"\n",
+    "            ),\n",
+    "            name='Fake Data'\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "    # Update layout\n",
+    "    fig.update_layout(\n",
+    "        title=f't-SNE of Multivariate Time Series (Window Size: {window_size})',\n",
+    "        height=900,\n",
+    "        hovermode='closest'\n",
+    "    )\n",
+    "\n",
+    "    # Update axes\n",
+    "    fig.update_xaxes(title_text='t-SNE 1')\n",
+    "    fig.update_yaxes(title_text='t-SNE 2')\n",
+    "    \n",
+    "    fig.show()\n",
+    "    \n",
+    "def my_load_data_gan(\n",
+    "    data_path,\n",
+    "    filter=None,\n",
+    "):\n",
+    "    df = pd.read_csv(data_path, index_col=0, parse_dates=True)\n",
+    "    df = fill_missing_values(df)\n",
+    "\n",
+    "    # TODO spaltenname generieren\n",
+    "    dataset = TimeSeriesDataSetGAN(df, 144 + 48)\n",
+    "\n",
+    "    return dataset\n",
+    "    # TODO 2. Klasse bauen die filtert\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = my_load_data_gan(\"../KIWaVo/data/input/Hollingstedt8.csv\")\n",
+    "real_data = np.array([ds[i] for i in range(len(ds))])\n",
+    "real_data2 = real_data.reshape(real_data.shape[0],-1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_samples=10000\n",
+    "tsne = TSNE(n_components=2, random_state=42)\n",
+    "real_data_tsne = tsne.fit_transform(real_data2[:num_samples])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_tsne_plotly(real_data_tsne,real_data_tsne,ds.seq_len)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"../KIWaVo/data/input/Hollingstedt8.csv\", index_col=0, parse_dates=True)\n",
+    "df = fill_missing_values(df)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch \n",
+    "hours = df.index.hour\n",
+    "hour_rad = torch.tensor(2*torch.pi*hours/24)\n",
+    "df[\"h_sin\"] = torch.sin(hour_rad)\n",
+    "df[\"h_cos\"] = torch.cos(hour_rad)\n",
+    "\n",
+    "days = df.index.day_of_year + hours / 24\n",
+    "days_rad = torch.tensor(2*torch.pi*days/365)\n",
+    "df[\"d_sin\"] = torch.sin(days_rad)\n",
+    "df[\"d_cos\"] = torch.cos(days_rad)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Axes: xlabel='tstamp'>"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df[[\"h_sin\",\"h_cos\",\"d_sin\",\"d_cos\"]][:1000].plot()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 147,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "windows[0].shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 128,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "anal_sample_no = 1000\n",
+    "seq_len = ds.seq_len\n",
+    "for i in range(anal_sample_no):\n",
+    "    if (i == 0):\n",
+    "        prep_data = np.reshape(np.mean(real_data[0,:,:], 1), [1,seq_len])\n",
+    "        #prep_data_hat = np.reshape(np.mean(generated_data[0,:,:],1), [1,seq_len])\n",
+    "    else:\n",
+    "        prep_data = np.concatenate((prep_data, \n",
+    "                                    np.reshape(np.mean(real_data[i,:,:],1), [1,seq_len])))\n",
+    "        #prep_data_hat = np.concatenate((prep_data_hat, \n",
+    "        #                                np.reshape(np.mean(generated_data[i,:,:],1), [1,seq_len])))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prep_data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Time-series Generative Adversarial Networks (TimeGAN) Codebase.\n",
+    "\n",
+    "Reference: Jinsung Yoon, Daniel Jarrett, Mihaela van der Schaar, \n",
+    "\"Time-series Generative Adversarial Networks,\" \n",
+    "Neural Information Processing Systems (NeurIPS), 2019.\n",
+    "\n",
+    "Paper link: https://papers.nips.cc/paper/8789-time-series-generative-adversarial-networks\n",
+    "\n",
+    "Last updated Date: April 24th 2020\n",
+    "Code author: Jinsung Yoon (jsyoon0823@gmail.com)\n",
+    "\n",
+    "-----------------------------\n",
+    "\n",
+    "visualization_metrics.py\n",
+    "\n",
+    "Note: Use PCA or tSNE for generated and original data visualization\n",
+    "\"\"\"\n",
+    "\n",
+    "# Necessary packages\n",
+    "from sklearn.manifold import TSNE\n",
+    "from sklearn.decomposition import PCA\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "   \n",
+    "def visualization (ori_data, generated_data, analysis):\n",
+    "  \"\"\"Using PCA or tSNE for generated and original data visualization.\n",
+    "  \n",
+    "  Args:\n",
+    "    - ori_data: original data\n",
+    "    - generated_data: generated synthetic data\n",
+    "    - analysis: tsne or pca\n",
+    "  \"\"\"  \n",
+    "  # Analysis sample size (for faster computation)\n",
+    "  anal_sample_no = min([1000, len(ori_data)])\n",
+    "  idx = np.random.permutation(len(ori_data))[:anal_sample_no]\n",
+    "    \n",
+    "  # Data preprocessing\n",
+    "  ori_data = np.asarray(ori_data)\n",
+    "  generated_data = np.asarray(generated_data)  \n",
+    "  \n",
+    "  ori_data = ori_data[idx]\n",
+    "  generated_data = generated_data[idx]\n",
+    "  \n",
+    "  no, seq_len, dim = ori_data.shape  \n",
+    "  \n",
+    "  for i in range(anal_sample_no):\n",
+    "    if (i == 0):\n",
+    "      prep_data = np.reshape(np.mean(ori_data[0,:,:], 1), [1,seq_len])\n",
+    "      prep_data_hat = np.reshape(np.mean(generated_data[0,:,:],1), [1,seq_len])\n",
+    "    else:\n",
+    "      prep_data = np.concatenate((prep_data, \n",
+    "                                  np.reshape(np.mean(ori_data[i,:,:],1), [1,seq_len])))\n",
+    "      prep_data_hat = np.concatenate((prep_data_hat, \n",
+    "                                      np.reshape(np.mean(generated_data[i,:,:],1), [1,seq_len])))\n",
+    "    \n",
+    "  # Visualization parameter        \n",
+    "  colors = [\"red\" for i in range(anal_sample_no)] + [\"blue\" for i in range(anal_sample_no)]    \n",
+    "    \n",
+    "  if analysis == 'pca':\n",
+    "    # PCA Analysis\n",
+    "    pca = PCA(n_components = 2)\n",
+    "    pca.fit(prep_data)\n",
+    "    pca_results = pca.transform(prep_data)\n",
+    "    pca_hat_results = pca.transform(prep_data_hat)\n",
+    "    \n",
+    "    # Plotting\n",
+    "    f, ax = plt.subplots(1)    \n",
+    "    plt.scatter(pca_results[:,0], pca_results[:,1],\n",
+    "                c = colors[:anal_sample_no], alpha = 0.2, label = \"Original\")\n",
+    "    plt.scatter(pca_hat_results[:,0], pca_hat_results[:,1], \n",
+    "                c = colors[anal_sample_no:], alpha = 0.2, label = \"Synthetic\")\n",
+    "  \n",
+    "    ax.legend()  \n",
+    "    plt.title('PCA plot')\n",
+    "    plt.xlabel('x-pca')\n",
+    "    plt.ylabel('y_pca')\n",
+    "    plt.show()\n",
+    "    \n",
+    "  elif analysis == 'tsne':\n",
+    "    \n",
+    "    # Do t-SNE Analysis together       \n",
+    "    prep_data_final = np.concatenate((prep_data, prep_data_hat), axis = 0)\n",
+    "    \n",
+    "    # t-SNE analysis\n",
+    "    tsne = TSNE(n_components = 2, verbose = 1, perplexity = 40, n_iter = 300)\n",
+    "    tsne_results = tsne.fit_transform(prep_data_final)\n",
+    "      \n",
+    "    # Plotting\n",
+    "    f, ax = plt.subplots(1)\n",
+    "      \n",
+    "    plt.scatter(tsne_results[:anal_sample_no,0], tsne_results[:anal_sample_no,1], \n",
+    "                c = colors[:anal_sample_no], alpha = 0.2, label = \"Original\")\n",
+    "    plt.scatter(tsne_results[anal_sample_no:,0], tsne_results[anal_sample_no:,1], \n",
+    "                c = colors[anal_sample_no:], alpha = 0.2, label = \"Synthetic\")\n",
+    "  \n",
+    "    ax.legend()\n",
+    "      \n",
+    "    plt.title('t-SNE plot')\n",
+    "    plt.xlabel('x-tsne')\n",
+    "    plt.ylabel('y_tsne')\n",
+    "    plt.show()    "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/src/main_hyper.py b/src/main_hyper.py
index 03a514d7553324e37fc2ee7ed634a32ee230388e..015ff821af0fe3125a2c4708b725411e42e944fe 100644
--- a/src/main_hyper.py
+++ b/src/main_hyper.py
@@ -140,11 +140,11 @@ class Objective:
 
     def _get_model_params(self,trial : optuna.Trial):
         #differencing = 0
-        differencing = trial.suggest_int("differencing", 0, 1)
+        differencing = trial.suggest_int("differencing", 1, 1)
         learning_rate = trial.suggest_float("lr", 0.00001, 0.01)
-        # optimizer = trial.suggest_categorical("optimizer", ["adam","adamw"])
+        # optimizer = trial.suggest_categorical("optimizer", ["adam","adamw"])  
         optimizer = trial.suggest_categorical("optimizer", ["adam"])
-        embed_time = trial.suggest_categorical("embed_time", [True,False])
+        embed_time = trial.suggest_categorical("embed_time", [False])
         #model_architecture = trial.suggest_categorical("model_architecture", ["classic_lstm"])
         #model_architecture = trial.suggest_categorical("model_architecture", ["transformer","autoformer"]) #TODO add models here
         #model_architecture = trial.suggest_categorical("model_architecture", ["autoformer"])
@@ -174,7 +174,7 @@ class Objective:
         if model_architecture in ["classic_lstm", "last_lstm"]:
             model_params = dict(
                 hidden_size_lstm=trial.suggest_int("hidden_size_lstm", 32, 512),
-                num_layers_lstm=trial.suggest_int("n_layers_lstm", 1, 3),
+                num_layers_lstm=trial.suggest_int("n_layers_lstm", 1, 2),  # upper bound lowered from 3
                 # num_layers_lstm = trial.suggest_int("n_layers_lstm", 1, 1),
                 hidden_size=trial.suggest_int("hidden_size", 32, 512),
                 num_layers=trial.suggest_int("n_layers", 2, 4),
diff --git a/src/utils/utility.py b/src/utils/utility.py
index 71e2ea1123674cbde055bdeb71b42f276c94dd21..954e1253f69577c17d0cd8b1e571e6140f752eff 100644
--- a/src/utils/utility.py
+++ b/src/utils/utility.py
@@ -8,7 +8,7 @@ import torch
 import pandas as pd
 from utils.timefeatures import time_features
 
-def fill_missing_values(df: pd.DataFrame, max_fill=10) -> pd.DataFrame:
+def fill_missing_values(df: pd.DataFrame, max_fill=200) -> pd.DataFrame:
     """
     Fills values in a DataFrame.
     First columns recognized as Precipitation columns (containing NEW or NVh) are filled with 0,
@@ -26,7 +26,7 @@ def fill_missing_values(df: pd.DataFrame, max_fill=10) -> pd.DataFrame:
     na_count = df.isna().sum(axis=0)
 
     # get all columns with precipitation and fill missing values with 0
-    mask = df.columns.str.contains('NEW') | df.columns.str.contains('NVh')
+    mask = df.columns.str.contains('NEW') | df.columns.str.contains('NVh') | (df.columns.str.startswith('N') & df.columns.str.endswith('_mm'))
 
     prec_cols = list(na_count[mask][na_count[mask] > 0].index)
     if len(prec_cols) > 0: