diff --git a/.gitignore b/.gitignore index 44131ac..9cd5716 100755 --- a/.gitignore +++ b/.gitignore @@ -162,4 +162,4 @@ ciccada/ *.csv -*.png \ No newline at end of file +*.png diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100755 new mode 100644 index 06e8e6c..9e4a3d6 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -11,7 +11,7 @@ repos: exclude: ^uv.lock$ - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.8.2 + rev: v0.15.14 hooks: # Run the linter. - id: ruff diff --git a/BOM_NCI/Get_ALL_postcodes_ABS.ipynb b/BOM_NCI/Get_ALL_postcodes_ABS.ipynb index e1bffd6..559cbf3 100755 --- a/BOM_NCI/Get_ALL_postcodes_ABS.ipynb +++ b/BOM_NCI/Get_ALL_postcodes_ABS.ipynb @@ -8,12 +8,14 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "import xarray as xr\n", + "\n", + "sys.path.append(\"../\")\n", "import dask\n", "import geopandas as gpd\n", + "import xarray as xr\n", "from shapely.geometry import Point\n", + "\n", + "from visualisation import *\n", "# crs = EPSG:4326 (WGS 84)" ] }, @@ -36,7 +38,7 @@ ], "source": [ "bom_path = \"/home/hossein/CICCADA/BOM_NCI/2023/01/01/\"\n", - "files = glob(bom_path+\"*.nc\")\n", + "files = glob(bom_path + \"*.nc\")\n", "len(files)" ] }, @@ -57,13 +59,13 @@ "source": [ "df = [xr.open_dataset(file).to_dataframe() for file in files[:15]]\n", "df = pd.concat(df, axis=0).reset_index(drop=False)\n", - "df = df.dropna(subset='direct_normal_irradiance').reset_index(drop=True)\n", - "df['julian_date'] = pd.to_datetime(df['julian_date'], origin='julian', unit='D')\n", - "df = df[['latitude', 'longitude']].drop_duplicates().reset_index(drop=True)\n", + "df = df.dropna(subset=\"direct_normal_irradiance\").reset_index(drop=True)\n", + "df[\"julian_date\"] = pd.to_datetime(df[\"julian_date\"], origin=\"julian\", unit=\"D\")\n", + "df = df[[\"latitude\", \"longitude\"]].drop_duplicates().reset_index(drop=True)\n", "# df = df.query(f\"latitude >= -35 & latitude <= -34.6 & longitude >= 138.5 & longitude <= 138.8\").reset_index(drop=True)\n", - "df['geometry'] = [Point(x,y) for x,y in zip(df['longitude'], df['latitude'])]\n", - "geo_list = df['geometry'].unique()\n", - "print('len(geo_list): ', len(geo_list))" + "df[\"geometry\"] = [Point(x, y) for x, y in zip(df[\"longitude\"], df[\"latitude\"])]\n", + "geo_list = df[\"geometry\"].unique()\n", + "print(\"len(geo_list): \", len(geo_list))" ] }, { @@ -73,7 +75,9 @@ "metadata": {}, "outputs": [], "source": [ - "gdf = gpd.GeoDataFrame(df[['longitude', 'latitude', 'geometry']], geometry='geometry', crs='EPSG:4326') # assuming WGS84" + "gdf = gpd.GeoDataFrame(\n", + " df[[\"longitude\", \"latitude\", \"geometry\"]], geometry=\"geometry\", crs=\"EPSG:4326\"\n", + ") # assuming WGS84" ] }, { @@ -83,8 +87,8 @@ "metadata": {}, "outputs": [], "source": [ - "gdf_postcodes = gpd.read_file('POA_2021_AUST_GDA2020_SHP/POA_2021_AUST_GDA2020.shp')\n", - "gdf_postcodes = gdf_postcodes.to_crs('EPSG:4326') # Ensure same CRS" + "gdf_postcodes = gpd.read_file(\"POA_2021_AUST_GDA2020_SHP/POA_2021_AUST_GDA2020.shp\")\n", + "gdf_postcodes = gdf_postcodes.to_crs(\"EPSG:4326\") # Ensure same CRS" ] }, { @@ -153,10 +157,7 @@ "outputs": [], "source": [ "gdf_joined = gpd.sjoin(\n", - " gdf,\n", - " gdf_postcodes[['POA_CODE21', 'geometry']],\n", - " how='left',\n", - " predicate='within'\n", + " gdf, gdf_postcodes[[\"POA_CODE21\", \"geometry\"]], how=\"left\", predicate=\"within\"\n", ")" ] }, @@ -167,7 +168,7 @@ "metadata": {}, "outputs": [], "source": [ - "gdf_joined.drop(columns=['index_right'], inplace=True)\n", + "gdf_joined.drop(columns=[\"index_right\"], inplace=True)\n", "gdf_joined = gdf_joined.dropna().reset_index(drop=True)" ] }, @@ -178,7 +179,7 @@ "metadata": {}, "outputs": [], "source": [ - "gdf_joined.to_csv('bom_postcodes_points.csv', index=False)" + "gdf_joined.to_csv(\"bom_postcodes_points.csv\", index=False)" ] }, { @@ -188,7 +189,7 @@ "metadata": {}, "outputs": [], "source": [ - "gdf_postcodes['geometry'][0]" + "gdf_postcodes[\"geometry\"][0]" ] }, { @@ -265,7 +266,7 @@ ], "source": [ "fig, ax = plt.subplots()\n", - "gdf_postcodes.plot(ax=ax, facecolor='none', edgecolor='black')\n" + "gdf_postcodes.plot(ax=ax, facecolor=\"none\", edgecolor=\"black\")" ] }, { @@ -275,7 +276,7 @@ "metadata": {}, "outputs": [], "source": [ - "gdf.plot(ax=ax, color='red', markersize=2)\n", + "gdf.plot(ax=ax, color=\"red\", markersize=2)\n", "plt.show()" ] } diff --git a/BOM_NCI/Get_ALL_postcodes_GNAF.ipynb b/BOM_NCI/Get_ALL_postcodes_GNAF.ipynb index e97f1b9..0b6373d 100755 --- a/BOM_NCI/Get_ALL_postcodes_GNAF.ipynb +++ b/BOM_NCI/Get_ALL_postcodes_GNAF.ipynb @@ -8,13 +8,15 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "import xarray as xr\n", + "\n", + "sys.path.append(\"../\")\n", "import dask\n", "import geopandas as gpd\n", + "import xarray as xr\n", "from shapely.geometry import Point\n", "from sklearn.neighbors import KDTree\n", + "\n", + "from visualisation import *\n", "# crs = EPSG:4326 (WGS 84)" ] }, @@ -37,7 +39,7 @@ ], "source": [ "bom_path = \"/home/hossein/CICCADA/BOM_NCI/2023/01/01/\"\n", - "files = glob(bom_path+\"*.nc\")\n", + "files = glob(bom_path + \"*.nc\")\n", "len(files)" ] }, @@ -50,9 +52,9 @@ "source": [ "df = [xr.open_dataset(file).to_dataframe() for file in files[:15]]\n", "df = pd.concat(df, axis=0).reset_index(drop=False)\n", - "df = df.dropna(subset='direct_normal_irradiance').reset_index(drop=True)\n", - "df['julian_date'] = pd.to_datetime(df['julian_date'], origin='julian', unit='D')\n", - "df = df[['latitude', 'longitude']].drop_duplicates().reset_index(drop=True)" + "df = df.dropna(subset=\"direct_normal_irradiance\").reset_index(drop=True)\n", + "df[\"julian_date\"] = pd.to_datetime(df[\"julian_date\"], origin=\"julian\", unit=\"D\")\n", + "df = df[[\"latitude\", \"longitude\"]].drop_duplicates().reset_index(drop=True)" ] }, { @@ -113,9 +115,13 @@ "metadata": {}, "outputs": [], "source": [ - "SA_STREET_LOCALITY_POINT_psv = pd.read_csv(glob(f\"{naf_path}SA_STREET_LOCALITY_POINT_psv.psv\")[0], sep='|', low_memory=False).dropna(axis=1)\n", - "SA_ADDRESS_DETAIL_psv = pd.read_csv(glob(f\"{naf_path}SA_ADDRESS_DETAIL_psv.psv\")[0], sep='|', low_memory=False).dropna(axis=1)\n", - "# SA_ADDRESS_DETAIL_psv\n" + "SA_STREET_LOCALITY_POINT_psv = pd.read_csv(\n", + " glob(f\"{naf_path}SA_STREET_LOCALITY_POINT_psv.psv\")[0], sep=\"|\", low_memory=False\n", + ").dropna(axis=1)\n", + "SA_ADDRESS_DETAIL_psv = pd.read_csv(\n", + " glob(f\"{naf_path}SA_ADDRESS_DETAIL_psv.psv\")[0], sep=\"|\", low_memory=False\n", + ").dropna(axis=1)\n", + "# SA_ADDRESS_DETAIL_psv" ] }, { @@ -292,7 +298,9 @@ "metadata": {}, "outputs": [], "source": [ - "a = pd.read_csv(glob(f\"{naf_path}SA_ADDRESS_DETAIL_psv.psv\")[0], sep='|', low_memory=False)" + "a = pd.read_csv(\n", + " glob(f\"{naf_path}SA_ADDRESS_DETAIL_psv.psv\")[0], sep=\"|\", low_memory=False\n", + ")" ] }, { @@ -334,7 +342,7 @@ "metadata": {}, "outputs": [], "source": [ - "5035 in SA_ADDRESS_DETAIL_psv['POSTCODE'].unique()" + "5035 in SA_ADDRESS_DETAIL_psv[\"POSTCODE\"].unique()" ] }, { @@ -344,7 +352,7 @@ "metadata": {}, "outputs": [], "source": [ - "SA_ADDRESS_DETAIL_psv['POSTCODE'].unique().shape" + "SA_ADDRESS_DETAIL_psv[\"POSTCODE\"].unique().shape" ] }, { @@ -365,8 +373,14 @@ } ], "source": [ - "locaility_points = SA_STREET_LOCALITY_POINT_psv[['STREET_LOCALITY_PID', 'LONGITUDE', 'LATITUDE']].merge(SA_ADDRESS_DETAIL_psv[['STREET_LOCALITY_PID', 'POSTCODE']].drop_duplicates(), on='STREET_LOCALITY_PID', how='left')\n", - "locaility_points.drop(columns=['STREET_LOCALITY_PID'], inplace=True)\n", + "locaility_points = SA_STREET_LOCALITY_POINT_psv[\n", + " [\"STREET_LOCALITY_PID\", \"LONGITUDE\", \"LATITUDE\"]\n", + "].merge(\n", + " SA_ADDRESS_DETAIL_psv[[\"STREET_LOCALITY_PID\", \"POSTCODE\"]].drop_duplicates(),\n", + " on=\"STREET_LOCALITY_PID\",\n", + " how=\"left\",\n", + ")\n", + "locaility_points.drop(columns=[\"STREET_LOCALITY_PID\"], inplace=True)\n", "locaility_points.dropna(inplace=True)\n", "locaility_points.columns" ] @@ -388,8 +402,8 @@ "metadata": {}, "outputs": [], "source": [ - "postcode_coords = locaility_points[['LATITUDE', 'LONGITUDE']].to_numpy()\n", - "kdtree = KDTree(postcode_coords, metric='euclidean')" + "postcode_coords = locaility_points[[\"LATITUDE\", \"LONGITUDE\"]].to_numpy()\n", + "kdtree = KDTree(postcode_coords, metric=\"euclidean\")" ] }, { @@ -414,9 +428,11 @@ "metadata": {}, "outputs": [], "source": [ - "df['nearest_postcode'] = locaility_points.iloc[nearest_indices]['POSTCODE'].values\n", + "df[\"nearest_postcode\"] = locaility_points.iloc[nearest_indices][\"POSTCODE\"].values\n", "\n", - "df['distance_km'] = nearest_distances*111 # Rough conversion factor for degrees to kilometers" + "df[\"distance_km\"] = (\n", + " nearest_distances * 111\n", + ") # Rough conversion factor for degrees to kilometers" ] }, { @@ -447,7 +463,7 @@ } ], "source": [ - "df0['nearest_postcode'].unique().shape" + "df0[\"nearest_postcode\"].unique().shape" ] }, { @@ -468,7 +484,7 @@ } ], "source": [ - "5035 in df0['nearest_postcode'].unique()" + "5035 in df0[\"nearest_postcode\"].unique()" ] }, { @@ -478,7 +494,7 @@ "metadata": {}, "outputs": [], "source": [ - "df0.to_csv('bom_postcodes_points.csv', index=False)" + "df0.to_csv(\"bom_postcodes_points.csv\", index=False)" ] } ], diff --git a/BOM_NCI/describe_bom_data.ipynb b/BOM_NCI/describe_bom_data.ipynb index 97fbfde..8887989 100755 --- a/BOM_NCI/describe_bom_data.ipynb +++ b/BOM_NCI/describe_bom_data.ipynb @@ -7,10 +7,13 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", + "\n", + "sys.path.append(\"../\")\n", + "import concurrent.futures\n", + "\n", "import xarray as xr\n", - "import concurrent.futures" + "\n", + "from visualisation import *" ] }, { @@ -31,7 +34,7 @@ ], "source": [ "bom_path = \"/home/hossein/CICCADA/BOM_NCI/2023/01/01/\"\n", - "files = glob(bom_path+\"*.nc\")\n", + "files = glob(bom_path + \"*.nc\")\n", "len(files)" ] }, @@ -47,7 +50,7 @@ "# print(ds)\n", "\n", "# List all variables\n", - "# print(ds.variables)\n" + "# print(ds.variables)" ] }, { @@ -64,7 +67,7 @@ } ], "source": [ - "print(ds['latitude'].attrs)\n" + "print(ds[\"latitude\"].attrs)" ] }, { @@ -201,12 +204,12 @@ } ], "source": [ - "for key in ('quality_mask','cloud_type'):\n", + "for key in (\"quality_mask\", \"cloud_type\"):\n", " print(ds[key].long_name)\n", " print(ds[key].flag_meanings)\n", " print(ds[key].flag_values)\n", " print(ds[key].comment)\n", - " print('---------------------------------')" + " print(\"---------------------------------\")" ] } ], diff --git a/BOM_NCI/get_postcode_SA.ipynb b/BOM_NCI/get_postcode_SA.ipynb index 30d1712..16ea5a3 100755 --- a/BOM_NCI/get_postcode_SA.ipynb +++ b/BOM_NCI/get_postcode_SA.ipynb @@ -7,15 +7,19 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "import xarray as xr\n", - "import dask, re\n", + "\n", + "sys.path.append(\"../\")\n", + "import re\n", + "\n", + "import dask\n", "import geopandas as gpd\n", - "from shapely.geometry import Point\n", + "import xarray as xr\n", + "from geopy.exc import GeocoderTimedOut\n", "from geopy.geocoders import Nominatim\n", + "from shapely.geometry import Point\n", "from tqdm import tqdm # For progress bar in large datasets\n", - "from geopy.exc import GeocoderTimedOut" + "\n", + "from visualisation import *" ] }, { @@ -36,7 +40,7 @@ ], "source": [ "bom_path = \"/home/hossein/CICCADA/BOM_NCI/2023/01/01/\"\n", - "files = glob(bom_path+\"*.nc\")\n", + "files = glob(bom_path + \"*.nc\")\n", "len(files)" ] }, @@ -56,12 +60,14 @@ "source": [ "df = [xr.open_dataset(file).to_dataframe() for file in files[:15]]\n", "df = pd.concat(df, axis=0).reset_index(drop=False)\n", - "df = df.dropna(subset='direct_normal_irradiance').reset_index(drop=True)\n", - "df['julian_date'] = pd.to_datetime(df['julian_date'], origin='julian', unit='D')\n", - "df = df.query(f\"latitude >= -35 & latitude <= -34.6 & longitude >= 138.5 & longitude <= 138.8\").reset_index(drop=True)\n", - "df['geometry'] = [Point(x,y) for x,y in zip(df['longitude'], df['latitude'])]\n", - "geo_list = df['geometry'].unique()\n", - "print('len(geo_list): ', len(geo_list))" + "df = df.dropna(subset=\"direct_normal_irradiance\").reset_index(drop=True)\n", + "df[\"julian_date\"] = pd.to_datetime(df[\"julian_date\"], origin=\"julian\", unit=\"D\")\n", + "df = df.query(\n", + " f\"latitude >= -35 & latitude <= -34.6 & longitude >= 138.5 & longitude <= 138.8\"\n", + ").reset_index(drop=True)\n", + "df[\"geometry\"] = [Point(x, y) for x, y in zip(df[\"longitude\"], df[\"latitude\"])]\n", + "geo_list = df[\"geometry\"].unique()\n", + "print(\"len(geo_list): \", len(geo_list))" ] }, { @@ -739,14 +745,14 @@ "for _ in range(len(geo_list)):\n", " try:\n", " address = gpd.tools.reverse_geocode(geo_list[i], timeout=60)\n", - " if address['address'].to_list()[0]!=None:\n", - " print(address['address'], i)\n", + " if address[\"address\"].to_list()[0] != None:\n", + " print(address[\"address\"], i)\n", " address_i.append(address)\n", " i_i.append(i)\n", " except Exception as e:\n", " pass\n", " print(e, i)\n", - " i += 1\n" + " i += 1" ] }, { @@ -766,7 +772,7 @@ } ], "source": [ - "postcode = re.search(r'\\b\\d{4}\\b', address_i[0]['address'].to_list()[0])\n", + "postcode = re.search(r\"\\b\\d{4}\\b\", address_i[0][\"address\"].to_list()[0])\n", "postcode.group()" ] }, @@ -788,7 +794,7 @@ } ], "source": [ - "i_i[0], address_i[0]['address'].to_list()[0]" + "i_i[0], address_i[0][\"address\"].to_list()[0]" ] }, { @@ -817,11 +823,10 @@ "metadata": {}, "outputs": [], "source": [ - "postcode_i = [re.search(r'\\b\\d{4}\\b', i['address'].to_list()[0]) for i in address_i]\n", - "postcode_i = [i.group() if i is not None else 'None' for i in postcode_i]\n", + "postcode_i = [re.search(r\"\\b\\d{4}\\b\", i[\"address\"].to_list()[0]) for i in address_i]\n", + "postcode_i = [i.group() if i is not None else \"None\" for i in postcode_i]\n", "lat_i = [geo_list[i].y for i in i_i]\n", - "lon_i = [geo_list[i].x for i in i_i]\n", - "\n" + "lon_i = [geo_list[i].x for i in i_i]" ] }, { @@ -830,7 +835,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_address = pd.DataFrame({'postcode': postcode_i, 'latitude': lat_i, 'longitude': lon_i})" + "df_address = pd.DataFrame(\n", + " {\"postcode\": postcode_i, \"latitude\": lat_i, \"longitude\": lon_i}\n", + ")" ] }, { @@ -1168,7 +1175,7 @@ } ], "source": [ - "df.merge(df_address, on=['latitude', 'longitude'], how='inner')" + "df.merge(df_address, on=[\"latitude\", \"longitude\"], how=\"inner\")" ] }, { @@ -1177,7 +1184,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_address.to_csv('Adelide_postcode_points.csv', index=False)" + "df_address.to_csv(\"Adelide_postcode_points.csv\", index=False)" ] }, { @@ -1186,7 +1193,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_address = pd.read_csv('Adelide_postcode_points.csv')" + "df_address = pd.read_csv(\"Adelide_postcode_points.csv\")" ] }, { @@ -1628,7 +1635,7 @@ } ], "source": [ - "df[df['latitude']==geo_list[1].y]" + "df[df[\"latitude\"] == geo_list[1].y]" ] }, { @@ -1637,17 +1644,17 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# Add a reverse geocoding function\n", "def get_postcode(point):\n", " location = geolocator.reverse((point.y, point.x), exactly_one=True)\n", - " if location and 'postcode' in location.raw['address']:\n", - " return location.raw['address']['postcode']\n", + " if location and \"postcode\" in location.raw[\"address\"]:\n", + " return location.raw[\"address\"][\"postcode\"]\n", " return None\n", "\n", + "\n", "# Apply with progress bar\n", "tqdm.pandas()\n", - "gdf['Postcode'] = gdf['geometry'].progress_apply(get_postcode)\n", + "gdf[\"Postcode\"] = gdf[\"geometry\"].progress_apply(get_postcode)\n", "\n", "print(gdf)" ] @@ -1670,7 +1677,7 @@ } ], "source": [ - "np.sort(df0.loc[0, 'julian_date'])" + "np.sort(df0.loc[0, \"julian_date\"])" ] }, { @@ -1694,7 +1701,7 @@ } ], "source": [ - "np.sort(df0.loc[10, 'julian_date'])" + "np.sort(df0.loc[10, \"julian_date\"])" ] }, { @@ -1718,7 +1725,7 @@ } ], "source": [ - "np.sort(df0.loc[22, 'time'])" + "np.sort(df0.loc[22, \"time\"])" ] }, { @@ -1727,12 +1734,24 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# print(df.columns)\n", "# print(df['julian_date'][0])\n", - "df = df.query('direct_normal_irradiance > 0').drop_duplicates(subset='julian_date').loc[:, ['julian_date', 'direct_normal_irradiance', 'surface_global_irradiance',\n", - " 'surface_diffuse_irradiance', 'cloud_optical_depth', 'cloud_type']]\n", - "print(df['julian_date'].min(), df['julian_date'].max())\n", + "df = (\n", + " df.query(\"direct_normal_irradiance > 0\")\n", + " .drop_duplicates(subset=\"julian_date\")\n", + " .loc[\n", + " :,\n", + " [\n", + " \"julian_date\",\n", + " \"direct_normal_irradiance\",\n", + " \"surface_global_irradiance\",\n", + " \"surface_diffuse_irradiance\",\n", + " \"cloud_optical_depth\",\n", + " \"cloud_type\",\n", + " ],\n", + " ]\n", + ")\n", + "print(df[\"julian_date\"].min(), df[\"julian_date\"].max())\n", "print(df[:30])" ] } diff --git a/BOM_NCI/pre_process.ipynb b/BOM_NCI/pre_process.ipynb index b0f9c17..a8bb5de 100755 --- a/BOM_NCI/pre_process.ipynb +++ b/BOM_NCI/pre_process.ipynb @@ -7,15 +7,17 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "import xarray as xr\n", + "\n", + "sys.path.append(\"../\")\n", "import dask\n", "import geopandas as gpd\n", - "from shapely.geometry import Point\n", + "import xarray as xr\n", + "from geopy.exc import GeocoderTimedOut\n", "from geopy.geocoders import Nominatim\n", + "from shapely.geometry import Point\n", "from tqdm import tqdm # For progress bar in large datasets\n", - "from geopy.exc import GeocoderTimedOut" + "\n", + "from visualisation import *" ] }, { @@ -33,9 +35,11 @@ ], "source": [ "import time\n", + "\n", "from geopy.exc import GeocoderTimedOut\n", "from geopy.geocoders import Nominatim\n", "\n", + "\n", "def geocode_with_retry(address, retries=3):\n", " geolocator = Nominatim(user_agent=\"geoapi\", timeout=30) # Increase timeout\n", " for attempt in range(retries):\n", @@ -48,6 +52,7 @@ " print(\"Failed to geocode after several attempts.\")\n", " return None\n", "\n", + "\n", "address = \"1600 Pennsylvania Ave NW, Washington, DC\"\n", "location = geocode_with_retry(address)\n", "if location:\n", @@ -72,7 +77,7 @@ ], "source": [ "bom_path = \"/mnt/d/bom_nci/2023/01/01/\"\n", - "files = glob(bom_path+\"*.nc\")\n", + "files = glob(bom_path + \"*.nc\")\n", "len(files)" ] }, @@ -84,8 +89,8 @@ "source": [ "df = [xr.open_dataset(file).to_dataframe() for file in files[:15]]\n", "df = pd.concat(df, axis=0).reset_index(drop=False)\n", - "df = df.dropna(subset='direct_normal_irradiance').reset_index(drop=True)\n", - "df['julian_date'] = pd.to_datetime(df['julian_date'], origin='julian', unit='D')\n" + "df = df.dropna(subset=\"direct_normal_irradiance\").reset_index(drop=True)\n", + "df[\"julian_date\"] = pd.to_datetime(df[\"julian_date\"], origin=\"julian\", unit=\"D\")" ] }, { @@ -94,7 +99,11 @@ "metadata": {}, "outputs": [], "source": [ - "df0 = df.groupby(['latitude',\t'longitude']).agg({'julian_date': lambda x:x.unique(),'time': lambda x:x.unique()}).reset_index(drop=False)" + "df0 = (\n", + " df.groupby([\"latitude\", \"longitude\"])\n", + " .agg({\"julian_date\": lambda x: x.unique(), \"time\": lambda x: x.unique()})\n", + " .reset_index(drop=False)\n", + ")" ] }, { @@ -283,7 +292,7 @@ "source": [ "geolocator = Nominatim(user_agent=\"geoapi\")\n", "# Convert to GeoDataFrame\n", - "df['geometry'] = [Point(x,y) for x,y in zip(df['longitude'], df['latitude'])]\n" + "df[\"geometry\"] = [Point(x, y) for x, y in zip(df[\"longitude\"], df[\"latitude\"])]" ] }, { @@ -306,7 +315,7 @@ } ], "source": [ - "points = df['geometry'].unique()" + "points = df[\"geometry\"].unique()" ] }, { @@ -315,7 +324,7 @@ "metadata": {}, "outputs": [], "source": [ - "df3 = df.drop_duplicates(subset='geometry').reset_index(drop=True)" + "df3 = df.drop_duplicates(subset=\"geometry\").reset_index(drop=True)" ] }, { @@ -371,7 +380,7 @@ } ], "source": [ - "df3['address'] = df3['geometry'].apply(lambda x: gpd.tools.reverse_geocode(x))\n" + "df3[\"address\"] = df3[\"geometry\"].apply(lambda x: gpd.tools.reverse_geocode(x))" ] }, { @@ -380,7 +389,7 @@ "metadata": {}, "outputs": [], "source": [ - "geo_list = df['geometry'].to_list()\n", + "geo_list = df[\"geometry\"].to_list()\n", "i = 0" ] }, @@ -390,7 +399,7 @@ "metadata": {}, "outputs": [], "source": [ - "i=0" + "i = 0" ] }, { @@ -410,7 +419,7 @@ } ], "source": [ - "address['address'].to_list()[0]==None" + "address[\"address\"].to_list()[0] == None" ] }, { @@ -447,8 +456,8 @@ } ], "source": [ - "if address['address'].to_list()[0]==None:\n", - " print('hey')" + "if address[\"address\"].to_list()[0] == None:\n", + " print(\"hey\")" ] }, { @@ -8665,14 +8674,14 @@ "for _ in range(len(geo_list)):\n", " try:\n", " address = gpd.tools.reverse_geocode(geo_list[i], timeout=60)\n", - " if address['address'].to_list()[0]!=None:\n", - " print(address['address'], i)\n", + " if address[\"address\"].to_list()[0] != None:\n", + " print(address[\"address\"], i)\n", " address_i.append(address)\n", " i_i.append(i)\n", " except Exception as e:\n", " pass\n", " print(e, i)\n", - " i += 1\n" + " i += 1" ] }, { @@ -8681,17 +8690,17 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# Add a reverse geocoding function\n", "def get_postcode(point):\n", " location = geolocator.reverse((point.y, point.x), exactly_one=True)\n", - " if location and 'postcode' in location.raw['address']:\n", - " return location.raw['address']['postcode']\n", + " if location and \"postcode\" in location.raw[\"address\"]:\n", + " return location.raw[\"address\"][\"postcode\"]\n", " return None\n", "\n", + "\n", "# Apply with progress bar\n", "tqdm.pandas()\n", - "gdf['Postcode'] = gdf['geometry'].progress_apply(get_postcode)\n", + "gdf[\"Postcode\"] = gdf[\"geometry\"].progress_apply(get_postcode)\n", "\n", "print(gdf)" ] @@ -8717,7 +8726,7 @@ } ], "source": [ - "np.sort(df0.loc[0, 'julian_date'])" + "np.sort(df0.loc[0, \"julian_date\"])" ] }, { @@ -8741,7 +8750,7 @@ } ], "source": [ - "np.sort(df0.loc[10, 'julian_date'])" + "np.sort(df0.loc[10, \"julian_date\"])" ] }, { @@ -8765,7 +8774,7 @@ } ], "source": [ - "np.sort(df0.loc[22, 'time'])" + "np.sort(df0.loc[22, \"time\"])" ] }, { @@ -8774,12 +8783,24 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# print(df.columns)\n", "# print(df['julian_date'][0])\n", - "df = df.query('direct_normal_irradiance > 0').drop_duplicates(subset='julian_date').loc[:, ['julian_date', 'direct_normal_irradiance', 'surface_global_irradiance',\n", - " 'surface_diffuse_irradiance', 'cloud_optical_depth', 'cloud_type']]\n", - "print(df['julian_date'].min(), df['julian_date'].max())\n", + "df = (\n", + " df.query(\"direct_normal_irradiance > 0\")\n", + " .drop_duplicates(subset=\"julian_date\")\n", + " .loc[\n", + " :,\n", + " [\n", + " \"julian_date\",\n", + " \"direct_normal_irradiance\",\n", + " \"surface_global_irradiance\",\n", + " \"surface_diffuse_irradiance\",\n", + " \"cloud_optical_depth\",\n", + " \"cloud_type\",\n", + " ],\n", + " ]\n", + ")\n", + "print(df[\"julian_date\"].min(), df[\"julian_date\"].max())\n", "print(df[:30])" ] } diff --git a/BOM_NCI/process_SA.ipynb b/BOM_NCI/process_SA.ipynb index 28765f3..7b631ac 100755 --- a/BOM_NCI/process_SA.ipynb +++ b/BOM_NCI/process_SA.ipynb @@ -7,10 +7,13 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", + "\n", + "sys.path.append(\"../\")\n", + "import concurrent.futures\n", + "\n", "import xarray as xr\n", - "import concurrent.futures" + "\n", + "from visualisation import *" ] }, { @@ -19,7 +22,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_address = pd.read_csv('Adelide_postcode_points.csv')" + "df_address = pd.read_csv(\"Adelide_postcode_points.csv\")" ] }, { @@ -40,7 +43,7 @@ ], "source": [ "bom_path = \"/mnt/d/bom_nci/2023/**/**/\"\n", - "files = glob(bom_path+\"*.nc\")\n", + "files = glob(bom_path + \"*.nc\")\n", "len(files)" ] }, @@ -51,9 +54,15 @@ "outputs": [], "source": [ "def process_file(file):\n", - " return xr.open_dataset(file).to_dataframe().reset_index(drop=False).merge(df_address, on=['latitude', 'longitude'], how='inner')\n", + " return (\n", + " xr.open_dataset(file)\n", + " .to_dataframe()\n", + " .reset_index(drop=False)\n", + " .merge(df_address, on=[\"latitude\", \"longitude\"], how=\"inner\")\n", + " )\n", "\n", - "num_cores = 10 \n" + "\n", + "num_cores = 10" ] }, { @@ -62,12 +71,11 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:\n", " df_list = list(executor.map(process_file, files))\n", "\n", "df = pd.concat(df_list, axis=0).reset_index(drop=True)\n", - "df = df.dropna(subset='direct_normal_irradiance').reset_index(drop=True)\n" + "df = df.dropna(subset=\"direct_normal_irradiance\").reset_index(drop=True)" ] }, { @@ -76,7 +84,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_csv('/mnt/d/bom_nci/2023/NCI_processed_Adelide.csv', index=False)" + "df.to_csv(\"/mnt/d/bom_nci/2023/NCI_processed_Adelide.csv\", index=False)" ] }, { @@ -96,7 +104,7 @@ } ], "source": [ - "os.path.getsize('/mnt/d/bom_nci/2023/NCI_processed_Adelide.csv') / (1024 * 1024)\n" + "os.path.getsize(\"/mnt/d/bom_nci/2023/NCI_processed_Adelide.csv\") / (1024 * 1024)" ] }, { @@ -105,7 +113,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('/mnt/d/bom_nci/2023/NCI_processed_Adelide.csv')" + "df = pd.read_csv(\"/mnt/d/bom_nci/2023/NCI_processed_Adelide.csv\")" ] }, { @@ -114,7 +122,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.drop(columns=['latitude', 'longitude', 'crs', 'julian_date'], inplace=True)" + "df.drop(columns=[\"latitude\", \"longitude\", \"crs\", \"julian_date\"], inplace=True)" ] }, { @@ -215,7 +223,9 @@ "metadata": {}, "outputs": [], "source": [ - "df.groupby(['time', 'postcode']).mean().reset_index().to_csv('/mnt/d/bom_nci/2023/NCI_processed_Adelaide_grouped.csv', index=False)" + "df.groupby([\"time\", \"postcode\"]).mean().reset_index().to_csv(\n", + " \"/mnt/d/bom_nci/2023/NCI_processed_Adelaide_grouped.csv\", index=False\n", + ")" ] }, { @@ -224,7 +234,7 @@ "metadata": {}, "outputs": [], "source": [ - "df=pd.read_csv('/mnt/d/bom_nci/2023/NCI_processed_Adelaide_grouped.csv')" + "df = pd.read_csv(\"/mnt/d/bom_nci/2023/NCI_processed_Adelaide_grouped.csv\")" ] }, { diff --git a/BOM_NCI/process_bom.ipynb b/BOM_NCI/process_bom.ipynb index 53f8aca..c835c2a 100755 --- a/BOM_NCI/process_bom.ipynb +++ b/BOM_NCI/process_bom.ipynb @@ -7,10 +7,13 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", + "\n", + "sys.path.append(\"../\")\n", + "import concurrent.futures\n", + "\n", "import xarray as xr\n", - "import concurrent.futures" + "\n", + "from visualisation import *" ] }, { @@ -30,7 +33,7 @@ } ], "source": [ - "os.path.getsize('G-NAF/')/1024" + "os.path.getsize(\"G-NAF/\") / 1024" ] }, { @@ -39,9 +42,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_address = pd.read_csv('bom_postcodes_points.csv')\n", - "df_address['longitude'] = df_address['longitude'].round(2).astype(str)\n", - "df_address['latitude'] = df_address['latitude'].round(2).astype(str)" + "df_address = pd.read_csv(\"bom_postcodes_points.csv\")\n", + "df_address[\"longitude\"] = df_address[\"longitude\"].round(2).astype(str)\n", + "df_address[\"latitude\"] = df_address[\"latitude\"].round(2).astype(str)" ] }, { @@ -61,7 +64,7 @@ } ], "source": [ - "5068 in df_address['nearest_postcode'].values" + "5068 in df_address[\"nearest_postcode\"].values" ] }, { @@ -95,17 +98,18 @@ "def process_file(file):\n", " try:\n", " df = xr.open_dataset(file).to_dataframe().reset_index(drop=False)\n", - " df = df.dropna(subset='surface_global_irradiance').reset_index(drop=True)\n", - " df['longitude'] = df['longitude'].round(2).astype(str)\n", - " df['latitude'] = df['latitude'].round(2).astype(str)\n", - " df = df.merge(df_address, on=['latitude', 'longitude'], how='inner')\n", - " df.drop(columns=['latitude', 'longitude', 'crs', 'julian_date'], inplace=True)\n", + " df = df.dropna(subset=\"surface_global_irradiance\").reset_index(drop=True)\n", + " df[\"longitude\"] = df[\"longitude\"].round(2).astype(str)\n", + " df[\"latitude\"] = df[\"latitude\"].round(2).astype(str)\n", + " df = df.merge(df_address, on=[\"latitude\", \"longitude\"], how=\"inner\")\n", + " df.drop(columns=[\"latitude\", \"longitude\", \"crs\", \"julian_date\"], inplace=True)\n", " return df\n", " except Exception as e:\n", " print(f\"Error processing file {file}: {e}\")\n", " return None\n", "\n", - "num_cores = 12 \n" + "\n", + "num_cores = 12" ] }, { @@ -114,11 +118,10 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor:\n", " df_list = list(executor.map(process_file, files))\n", "\n", - "df = pd.concat(df_list, axis=0).reset_index(drop=True)\n" + "df = pd.concat(df_list, axis=0).reset_index(drop=True)" ] }, { @@ -348,7 +351,9 @@ } ], "source": [ - "df.query(f\"time=='2022-12-31 18:30:00' and nearest_postcode==872 and direct_normal_irradiance==direct_normal_irradiance\")" + "df.query(\n", + " f\"time=='2022-12-31 18:30:00' and nearest_postcode==872 and direct_normal_irradiance==direct_normal_irradiance\"\n", + ")" ] }, { @@ -357,8 +362,8 @@ "metadata": {}, "outputs": [], "source": [ - "df.rename(columns={'nearest_postcode': 'postcode'}, inplace=True)\n", - "df = df.groupby(['time', 'postcode']).mean().reset_index()" + "df.rename(columns={\"nearest_postcode\": \"postcode\"}, inplace=True)\n", + "df = df.groupby([\"time\", \"postcode\"]).mean().reset_index()" ] }, { @@ -378,7 +383,7 @@ } ], "source": [ - "df['postcode'].unique().shape" + "df[\"postcode\"].unique().shape" ] }, { @@ -519,8 +524,13 @@ } ], "source": [ - "df0 = df.query(f\"surface_global_irradiance==surface_global_irradiance\").groupby('postcode').agg({'time': ['min', 'max', 'count']}).reset_index()\n", - "df0.columns = ['_'.join(col).strip() for col in df0.columns.values]\n", + "df0 = (\n", + " df.query(f\"surface_global_irradiance==surface_global_irradiance\")\n", + " .groupby(\"postcode\")\n", + " .agg({\"time\": [\"min\", \"max\", \"count\"]})\n", + " .reset_index()\n", + ")\n", + "df0.columns = [\"_\".join(col).strip() for col in df0.columns.values]\n", "df0" ] }, @@ -810,8 +820,8 @@ } ], "source": [ - "df0 = df.sort_values(by='time')\n", - "df0.groupby('postcode').apply(lambda x:x.diff().quantile(.99))" + "df0 = df.sort_values(by=\"time\")\n", + "df0.groupby(\"postcode\").apply(lambda x: x.diff().quantile(0.99))" ] }, { @@ -827,7 +837,9 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_csv('/home/hossein/CICCADA/BOM_NCI/2022/NCI_processed_grouped_Nov.csv', index=False)" + "df.to_csv(\n", + " \"/home/hossein/CICCADA/BOM_NCI/2022/NCI_processed_grouped_Nov.csv\", index=False\n", + ")" ] }, { @@ -847,7 +859,9 @@ } ], "source": [ - "os.path.getsize('/home/hossein/CICCADA/BOM_NCI/2022/NCI_processed_grouped_Nov.csv') / (1024 * 1024)" + "os.path.getsize(\"/home/hossein/CICCADA/BOM_NCI/2022/NCI_processed_grouped_Nov.csv\") / (\n", + " 1024 * 1024\n", + ")" ] }, { @@ -856,7 +870,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv('/home/hossein/CICCADA/BOM_NCI/2022/NCI_processed_grouped_Nov.csv')" + "df = pd.read_csv(\"/home/hossein/CICCADA/BOM_NCI/2022/NCI_processed_grouped_Nov.csv\")" ] }, { @@ -876,7 +890,7 @@ } ], "source": [ - "5068 in df['postcode'].unique()" + "5068 in df[\"postcode\"].unique()" ] }, { @@ -896,7 +910,7 @@ } ], "source": [ - "df['time'].min(), df['time'].max(), df['postcode'].unique().shape" + "df[\"time\"].min(), df[\"time\"].max(), df[\"postcode\"].unique().shape" ] }, { @@ -905,7 +919,13 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.concat([pd.read_csv(f) for f in glob('/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_*.csv')], axis=0).reset_index(drop=True)" + "df = pd.concat(\n", + " [\n", + " pd.read_csv(f)\n", + " for f in glob(\"/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_*.csv\")\n", + " ],\n", + " axis=0,\n", + ").reset_index(drop=True)" ] }, { @@ -925,7 +945,7 @@ } ], "source": [ - "df['time'].min(), df['time'].max(), df['postcode'].unique().shape" + "df[\"time\"].min(), df[\"time\"].max(), df[\"postcode\"].unique().shape" ] }, { @@ -934,7 +954,9 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_csv('/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_all.csv', index=False)" + "df.to_csv(\n", + " \"/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_all.csv\", index=False\n", + ")" ] } ], diff --git a/BOM_NCI/rclone.ipynb b/BOM_NCI/rclone.ipynb index 43e7672..0724f27 100755 --- a/BOM_NCI/rclone.ipynb +++ b/BOM_NCI/rclone.ipynb @@ -6,7 +6,8 @@ "metadata": {}, "outputs": [], "source": [ - "import subprocess, os" + "import os\n", + "import subprocess" ] }, { @@ -24,10 +25,15 @@ } ], "source": [ - "\n", - "result = subprocess.run(\"eval $(ssh-agent -s)\", env=env, shell=True, capture_output=True, text=True)\n", - "result = subprocess.run(\"ssh-add ~/.ssh/id_rsa_darth\", env=env, shell=True, capture_output=True, text=True)\n", - "result = subprocess.run(\"echo $SSH_AUTH_SOCK\", env=env, shell=True, capture_output=True, text=True)\n", + "result = subprocess.run(\n", + " \"eval $(ssh-agent -s)\", env=env, shell=True, capture_output=True, text=True\n", + ")\n", + "result = subprocess.run(\n", + " \"ssh-add ~/.ssh/id_rsa_darth\", env=env, shell=True, capture_output=True, text=True\n", + ")\n", + "result = subprocess.run(\n", + " \"echo $SSH_AUTH_SOCK\", env=env, shell=True, capture_output=True, text=True\n", + ")\n", "\n", "print(result.stdout)" ] @@ -59,7 +65,9 @@ "# ssh-add ~/.ssh/id_rsa_darth\n", "# eval \"$(ssh-agent -s)\"\n", "env = os.environ.copy()\n", - "os.environ[\"SSH_AUTH_SOCK\"] = '/tmp/ssh-XXXXXXeSFgMQ/agent.9315' # Use the actual value from terminal\n", + "os.environ[\"SSH_AUTH_SOCK\"] = (\n", + " \"/tmp/ssh-XXXXXXeSFgMQ/agent.9315\" # Use the actual value from terminal\n", + ")\n", "# Verify if SSH_AUTH_SOCK is set\n", "if \"SSH_AUTH_SOCK\" in env:\n", " print(f\"SSH_AUTH_SOCK: {env['SSH_AUTH_SOCK']}\")\n", @@ -67,7 +75,9 @@ " print(\"SSH_AUTH_SOCK is not set!\")\n", "\n", "# Run rclone command with environment variables\n", - "result = subprocess.run([\"rclone\", \"lsd\", \"bom_nci:\"], env=env, capture_output=True, text=True)\n", + "result = subprocess.run(\n", + " [\"rclone\", \"lsd\", \"bom_nci:\"], env=env, capture_output=True, text=True\n", + ")\n", "\n", "print(result.stdout)\n", "print(result.stderr)" @@ -95,7 +105,16 @@ } ], "source": [ - "result = subprocess.run([\"rclone\", \"lsd\", \"bom_nci:/g/data/rv74/satellite-products/arc/der/himawari-ahi/solar/p1s/v1.1/\"], env=env, capture_output=True, text=True)\n", + "result = subprocess.run(\n", + " [\n", + " \"rclone\",\n", + " \"lsd\",\n", + " \"bom_nci:/g/data/rv74/satellite-products/arc/der/himawari-ahi/solar/p1s/v1.1/\",\n", + " ],\n", + " env=env,\n", + " capture_output=True,\n", + " text=True,\n", + ")\n", "\n", "print(result.stdout)\n", "print(result.stderr)" diff --git a/Data_query/data_quer_trino.ipynb b/Data_query/data_quer_trino.ipynb index 67d010e..bc5c598 100644 --- a/Data_query/data_quer_trino.ipynb +++ b/Data_query/data_quer_trino.ipynb @@ -7,9 +7,9 @@ "metadata": {}, "outputs": [], "source": [ - "from trino_config import *\n", "import pandas as pd\n", - "import pytz" + "import pytz\n", + "from trino_config import *" ] }, { @@ -838,7 +838,10 @@ } ], "source": [ - "pd.read_sql(\"select * from sola_circuits as c left join sola_sites as s on c.site_id = s.site_id where circuit_id=547781\", engine)" + "pd.read_sql(\n", + " \"select * from sola_circuits as c left join sola_sites as s on c.site_id = s.site_id where circuit_id=547781\",\n", + " engine,\n", + ")" ] }, { @@ -848,8 +851,8 @@ "metadata": {}, "outputs": [], "source": [ - "t0 = '2025-01-01 00:00:00+00:00'\n", - "t1 = '2025-01-03 00:00:00+00:00'" + "t0 = \"2025-01-01 00:00:00+00:00\"\n", + "t1 = \"2025-01-03 00:00:00+00:00\"" ] }, { @@ -904,7 +907,10 @@ } ], "source": [ - "pd.read_sql(f\"\"\"select circuit_id, t_stamp from SolA_ts4 where year =2024 and is_pv=True and month=1 limit 1\"\"\", engine)" + "pd.read_sql(\n", + " f\"\"\"select circuit_id, t_stamp from SolA_ts4 where year =2024 and is_pv=True and month=1 limit 1\"\"\",\n", + " engine,\n", + ")" ] }, { @@ -1012,9 +1018,12 @@ } ], "source": [ - "pd.read_sql(f\"SELECT * \\\n", + "pd.read_sql(\n", + " f\"SELECT * \\\n", " from SolA_ts4 as ts left join SolA_circuits as c on ts.circuit_id = c.circuit_id left join SolA_sites as s on c.site_id = s.site_id \\\n", - " limit 1 \", engine)" + " limit 1 \",\n", + " engine,\n", + ")" ] }, { @@ -1024,11 +1033,14 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_sql(f\"SELECT s.site_id, t_stamp, avg(voltage) as avg_voltage, avg(power) as avg_power, avg(energy) as avg_energy \\\n", + "df = pd.read_sql(\n", + " f\"SELECT s.site_id, t_stamp, avg(voltage) as avg_voltage, avg(power) as avg_power, avg(energy) as avg_energy \\\n", " from SolA_ts4 as ts left join SolA_circuits as c on ts.circuit_id = c.circuit_id left join SolA_sites as s on c.site_id = s.site_id \\\n", " where ts.is_pv = True and month=1 and year=2025 and state='NSW' \\\n", " and t_stamp >= TIMESTAMP '{t0}' and t_stamp < TIMESTAMP '{t1}' \\\n", - " group by s.site_id, t_stamp \", engine)" + " group by s.site_id, t_stamp \",\n", + " engine,\n", + ")" ] }, { @@ -1038,11 +1050,14 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_sql(f\"SELECT s.site_id, t_stamp, avg(voltage) as avg_voltage, avg(power) as avg_power, avg(energy) as avg_energy \\\n", + "df = pd.read_sql(\n", + " f\"SELECT s.site_id, t_stamp, avg(voltage) as avg_voltage, avg(power) as avg_power, avg(energy) as avg_energy \\\n", " from SolA_ts5 as ts left join SolA_circuits as c on ts.circuit_id = c.circuit_id left join SolA_sites as s on c.site_id = s.site_id \\\n", " where ts.is_pv = True and month=1 and year=2025 and state='NSW' \\\n", " and t_stamp >= TIMESTAMP '{t0}' and t_stamp < TIMESTAMP '{t1}' \\\n", - " group by s.site_id, t_stamp \", engine)" + " group by s.site_id, t_stamp \",\n", + " engine,\n", + ")" ] }, { @@ -1101,9 +1116,11 @@ } ], "source": [ - "t0 = '2025-01-01 00:00:00+11:00'\n", - "t1 = '2025-02-01 00:00:00+11:00'\n", - "pd.read_sql((f\"\"\"\n", + "t0 = \"2025-01-01 00:00:00+11:00\"\n", + "t1 = \"2025-02-01 00:00:00+11:00\"\n", + "pd.read_sql(\n", + " (\n", + " f\"\"\"\n", " select circuit_id, count( t_stamp) as t_stamp_count, min(t_stamp ) as min_t_stamp, max(t_stamp ) as max_t_stamp\n", " from SolA_ts4 \n", " where is_pv = True and month=1 and year=2025 and circuit_id = 547781 \n", @@ -1111,7 +1128,10 @@ " \n", " group by circuit_id \n", " order by t_stamp_count desc, circuit_id desc\n", - "\"\"\"), engine)" + "\"\"\"\n", + " ),\n", + " engine,\n", + ")" ] }, { @@ -1162,14 +1182,18 @@ } ], "source": [ - "\n", - "pd.read_sql((f\"\"\"\n", + "pd.read_sql(\n", + " (\n", + " f\"\"\"\n", " select circuit_id, count(distinct t_stamp) as t_stamp_count\n", " from SolA_ts4 \n", " where is_pv = False and month=1 and year=2025 and circuit_id = 547781 \n", " group by circuit_id \n", " order by t_stamp_count desc, circuit_id desc\n", - "\"\"\"), engine)\n" + "\"\"\"\n", + " ),\n", + " engine,\n", + ")" ] }, { @@ -1260,11 +1284,16 @@ } ], "source": [ - "pd.read_sql((f\"\"\"\n", + "pd.read_sql(\n", + " (\n", + " f\"\"\"\n", " select *\n", " from SolA_ts5 \n", " limit 1\n", - "\"\"\"), engine)\n" + "\"\"\"\n", + " ),\n", + " engine,\n", + ")" ] }, { @@ -1319,12 +1348,17 @@ } ], "source": [ - "t0 = '2025-01-01 00:00:00+00:00'\n", - "t1 = '2025-02-01 00:00:00+00:00'\n", - "df = pd.read_sql((f\"\"\"\n", + "t0 = \"2025-01-01 00:00:00+00:00\"\n", + "t1 = \"2025-02-01 00:00:00+00:00\"\n", + "df = pd.read_sql(\n", + " (\n", + " f\"\"\"\n", " select max(t_stamp) as max_t_stamp, min(t_stamp) as min_t_stamp from SolA_ts4\n", " where is_pv = False and t_stamp >= TIMESTAMP '{t0}' and t_stamp < TIMESTAMP '{t1}' \n", - "\"\"\"), engine)\n", + "\"\"\"\n", + " ),\n", + " engine,\n", + ")\n", "df" ] }, @@ -1388,12 +1422,17 @@ } ], "source": [ - "t0 = '2025-01-01 00:00:00+00:00'\n", - "t1 = '2025-02-01 00:00:00+00:00'\n", - "df = pd.read_sql((f\"\"\"\n", + "t0 = \"2025-01-01 00:00:00+00:00\"\n", + "t1 = \"2025-02-01 00:00:00+00:00\"\n", + "df = pd.read_sql(\n", + " (\n", + " f\"\"\"\n", " select max(t_stamp) as max_t_stamp, min(t_stamp) as min_t_stamp from SolA_ts5\n", " where is_pv = True\n", - "\"\"\"), engine)\n", + "\"\"\"\n", + " ),\n", + " engine,\n", + ")\n", "df" ] }, @@ -1416,7 +1455,7 @@ } ], "source": [ - "df['max_t_stamp']" + "df[\"max_t_stamp\"]" ] }, { @@ -1480,13 +1519,18 @@ } ], "source": [ - "df = pd.read_sql((f\"\"\"\n", + "df = pd.read_sql(\n", + " (\n", + " f\"\"\"\n", " select *\n", " from SolA_ts \n", " where is_pv = True and month=1 and year=2024 and circuit_id = 547781 \n", " and t_stamp >= TIMESTAMP '{t0}' and t_stamp < TIMESTAMP '{t1}'\n", " order by t_stamp desc\n", - "\"\"\"), engine)\n", + "\"\"\"\n", + " ),\n", + " engine,\n", + ")\n", "df" ] }, @@ -1517,8 +1561,8 @@ "metadata": {}, "outputs": [], "source": [ - "df['t_stamp'] = pd.to_datetime(df['t_stamp'], utc=True)\n", - "df['t_stamp'] = df['t_stamp'].dt.tz_convert(pytz.FixedOffset(600)) # Convert to UTC+10" + "df[\"t_stamp\"] = pd.to_datetime(df[\"t_stamp\"], utc=True)\n", + "df[\"t_stamp\"] = df[\"t_stamp\"].dt.tz_convert(pytz.FixedOffset(600)) # Convert to UTC+10" ] }, { @@ -1528,7 +1572,7 @@ "metadata": {}, "outputs": [], "source": [ - "df['t_stamp'].min(), df['t_stamp'].max()" + "df[\"t_stamp\"].min(), df[\"t_stamp\"].max()" ] } ], diff --git a/Data_query/data_query.ipynb b/Data_query/data_query.ipynb index 9d12918..3f32da6 100644 --- a/Data_query/data_query.ipynb +++ b/Data_query/data_query.ipynb @@ -19,9 +19,9 @@ } ], "source": [ - "from spark_config import *\n", "import pandas as pd\n", - "import pytz\n" + "import pytz\n", + "from spark_config import *" ] }, { @@ -97,9 +97,11 @@ } ], "source": [ - "t0 = '2025-01-01 00:00:00+00:00'\n", - "t1 = '2025-02-01 00:00:00+00:00'\n", - "spark.sql(f\"select max(t_stamp), min(t_stamp) from sola_ts4 where t_stamp >= '{t0}' and t_stamp < '{t1}'\").show()" + "t0 = \"2025-01-01 00:00:00+00:00\"\n", + "t1 = \"2025-02-01 00:00:00+00:00\"\n", + "spark.sql(\n", + " f\"select max(t_stamp), min(t_stamp) from sola_ts4 where t_stamp >= '{t0}' and t_stamp < '{t1}'\"\n", + ").show()" ] }, { @@ -109,7 +111,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = spark.read.parquet(\"s3a://project-ciccada/spark-warehouse/bucketed_table/year=2025/month=1\")" + "df = spark.read.parquet(\n", + " \"s3a://project-ciccada/spark-warehouse/bucketed_table/year=2025/month=1\"\n", + ")" ] }, { @@ -156,11 +160,13 @@ } ], "source": [ - "spark.sql(f\"select circuit_id, count(distinct t_stamp) as t_stamp_count \\\n", + "spark.sql(\n", + " f\"select circuit_id, count(distinct t_stamp) as t_stamp_count \\\n", " from df \\\n", " where t_stamp >= '{t0}' and t_stamp < '{t1}' and circuit_id = 547781 \\\n", " group by circuit_id \\\n", - " order by t_stamp_count desc, circuit_id desc\").show()" + " order by t_stamp_count desc, circuit_id desc\"\n", + ").show()" ] }, { @@ -191,13 +197,15 @@ } ], "source": [ - "t0 = '2025-01-01 00:00:00+00:00'\n", - "t1 = '2025-01-03 00:00:00+00:00'\n", - "spark.sql(f\"select circuit_id, count( t_stamp) as t_stamp_count \\\n", + "t0 = \"2025-01-01 00:00:00+00:00\"\n", + "t1 = \"2025-01-03 00:00:00+00:00\"\n", + "spark.sql(\n", + " f\"select circuit_id, count( t_stamp) as t_stamp_count \\\n", " from sola_ts4 \\\n", " where t_stamp >= '{t0}' and t_stamp < '{t1}' and circuit_id = 547781 \\\n", " group by circuit_id \\\n", - " order by t_stamp_count desc, circuit_id desc\").show()" + " order by t_stamp_count desc, circuit_id desc\"\n", + ").show()" ] }, { @@ -263,7 +271,7 @@ } ], "source": [ - "spark.sql(\"SHOW TABLES\").show()\n" + "spark.sql(\"SHOW TABLES\").show()" ] }, { @@ -325,7 +333,7 @@ } ], "source": [ - "spark.sql(\"describe formatted SolA_ts\").show(50, truncate=False)\n" + "spark.sql(\"describe formatted SolA_ts\").show(50, truncate=False)" ] }, { @@ -335,21 +343,23 @@ "metadata": {}, "outputs": [], "source": [ - "ts_schema = StructType([\n", - " StructField(\"device_id\", LongType()),\n", - " StructField(\"circuit_id\", LongType()),\n", - " StructField(\"t_stamp\", TimestampType()),\n", - " StructField(\"power\", DoubleType()),\n", - " StructField(\"energy\", DoubleType()),\n", - " StructField(\"energy_reactive\", DoubleType()),\n", - " StructField(\"energy_import\", DoubleType()),\n", - " StructField(\"energy_export\", DoubleType()),\n", - " StructField(\"energy_reactive_import\", DoubleType()),\n", - " StructField(\"energy_reactive_export\", DoubleType()),\n", - " StructField(\"power_factor\", DoubleType()),\n", - " StructField(\"voltage\", DoubleType()),\n", - " StructField(\"current\", DoubleType()),\n", - "])" + "ts_schema = StructType(\n", + " [\n", + " StructField(\"device_id\", LongType()),\n", + " StructField(\"circuit_id\", LongType()),\n", + " StructField(\"t_stamp\", TimestampType()),\n", + " StructField(\"power\", DoubleType()),\n", + " StructField(\"energy\", DoubleType()),\n", + " StructField(\"energy_reactive\", DoubleType()),\n", + " StructField(\"energy_import\", DoubleType()),\n", + " StructField(\"energy_export\", DoubleType()),\n", + " StructField(\"energy_reactive_import\", DoubleType()),\n", + " StructField(\"energy_reactive_export\", DoubleType()),\n", + " StructField(\"power_factor\", DoubleType()),\n", + " StructField(\"voltage\", DoubleType()),\n", + " StructField(\"current\", DoubleType()),\n", + " ]\n", + ")" ] }, { @@ -359,8 +369,8 @@ "metadata": {}, "outputs": [], "source": [ - "t0 = '2024-01-02 00:00:00+10:00'\n", - "t1 = '2024-01-03 00:00:00+10:00'" + "t0 = \"2024-01-02 00:00:00+10:00\"\n", + "t1 = \"2024-01-03 00:00:00+10:00\"" ] }, { @@ -679,9 +689,19 @@ } ], "source": [ - "df1 = spark.read.option(\"recursiveFileLookup\", \"true\").schema(ts_schema)\\\n", - ".parquet(f\"s3a://project-ciccada/spark-warehouse/bucketed_table2/year=2024/month=1/\", \n", - " escape='\"', multiLine=True, quote='\"', encoding='UTF-8').filter(col(\"circuit_id\") == 547781).toPandas()" + "df1 = (\n", + " spark.read.option(\"recursiveFileLookup\", \"true\")\n", + " .schema(ts_schema)\n", + " .parquet(\n", + " f\"s3a://project-ciccada/spark-warehouse/bucketed_table2/year=2024/month=1/\",\n", + " escape='\"',\n", + " multiLine=True,\n", + " quote='\"',\n", + " encoding=\"UTF-8\",\n", + " )\n", + " .filter(col(\"circuit_id\") == 547781)\n", + " .toPandas()\n", + ")" ] }, { @@ -723,9 +743,11 @@ } ], "source": [ - "df1['t_stamp'] = df1['t_stamp'].dt.tz_localize('UTC').dt.tz_convert(pytz.FixedOffset(600))\n", + "df1[\"t_stamp\"] = (\n", + " df1[\"t_stamp\"].dt.tz_localize(\"UTC\").dt.tz_convert(pytz.FixedOffset(600))\n", + ")\n", "df2 = df1.query(f\"t_stamp >= '{t0}' and t_stamp < '{t1}'\")\n", - "df2[df2.duplicated()].shape\n" + "df2[df2.duplicated()].shape" ] }, { @@ -735,9 +757,18 @@ "metadata": {}, "outputs": [], "source": [ - "df1 = spark.read.schema(ts_schema)\\\n", - ".parquet(f\"s3a://project-ciccada/spark-warehouse/bucketed_table2/year=2024/month=1/\", \n", - " escape='\"', multiLine=True, quote='\"', encoding='UTF-8').filter(col(\"circuit_id\") == 547781).toPandas()" + "df1 = (\n", + " spark.read.schema(ts_schema)\n", + " .parquet(\n", + " f\"s3a://project-ciccada/spark-warehouse/bucketed_table2/year=2024/month=1/\",\n", + " escape='\"',\n", + " multiLine=True,\n", + " quote='\"',\n", + " encoding=\"UTF-8\",\n", + " )\n", + " .filter(col(\"circuit_id\") == 547781)\n", + " .toPandas()\n", + ")" ] }, { @@ -780,9 +811,11 @@ } ], "source": [ - "df = spark.sql(f\"SELECT circuit_id, t_stamp, power/1000 as power, voltage from SolA_ts \\\n", + "df = spark.sql(\n", + " f\"SELECT circuit_id, t_stamp, power/1000 as power, voltage from SolA_ts \\\n", " where is_pv = True and month=1 and year=2024 \\\n", - " and t_stamp >= '{t0}' and t_stamp < '{t1}'\").toPandas()" + " and t_stamp >= '{t0}' and t_stamp < '{t1}'\"\n", + ").toPandas()" ] }, { @@ -816,9 +849,11 @@ "metadata": {}, "outputs": [], "source": [ - "spark.sql(f\"SELECT circuit_id, t_stamp, power/1000 as power, voltage from SolA_ts \\\n", + "spark.sql(\n", + " f\"SELECT circuit_id, t_stamp, power/1000 as power, voltage from SolA_ts \\\n", " where is_pv = True and month=1 and year=2024 \\\n", - " and t_stamp > '{t0}' and t_stamp < '{t1}' and circuit_id=6066\").show(10, truncate=False)" + " and t_stamp > '{t0}' and t_stamp < '{t1}' and circuit_id=6066\"\n", + ").show(10, truncate=False)" ] }, { @@ -828,8 +863,8 @@ "metadata": {}, "outputs": [], "source": [ - "df['t_stamp'] = pd.to_datetime(df['t_stamp'], utc=True)\n", - "df['t_stamp'] = df['t_stamp'].dt.tz_convert(pytz.FixedOffset(600)) # Convert to UTC+10" + "df[\"t_stamp\"] = pd.to_datetime(df[\"t_stamp\"], utc=True)\n", + "df[\"t_stamp\"] = df[\"t_stamp\"].dt.tz_convert(pytz.FixedOffset(600)) # Convert to UTC+10" ] }, { @@ -839,7 +874,7 @@ "metadata": {}, "outputs": [], "source": [ - "df['t_stamp'].min(), df['t_stamp'].max()" + "df[\"t_stamp\"].min(), df[\"t_stamp\"].max()" ] } ], diff --git a/Data_query/duckdb_config.py b/Data_query/duckdb_config.py index 06f861d..48fbdac 100644 --- a/Data_query/duckdb_config.py +++ b/Data_query/duckdb_config.py @@ -1,22 +1,23 @@ -import duckdb, os +import os + import boto3 +import duckdb + session = boto3.Session() credentials = session.get_credentials().get_frozen_credentials() # s3 = boto3.resource(service_name='s3') -s3 = boto3.client('s3') +s3 = boto3.client("s3") -os.environ['AWS_ACCESS_KEY_ID'] = credentials.access_key -os.environ['AWS_SECRET_ACCESS_KEY'] = credentials.secret_key -os.environ['AWS_SESSION_TOKEN'] = credentials.token +os.environ["AWS_ACCESS_KEY_ID"] = credentials.access_key +os.environ["AWS_SECRET_ACCESS_KEY"] = credentials.secret_key +os.environ["AWS_SESSION_TOKEN"] = credentials.token duckdb.sql("INSTALL httpfs; LOAD httpfs;") duckdb.sql(f""" - SET s3_region='ap-southeast-2'; + SET s3_region='ap-southeast-2'; SET s3_access_key_id='{credentials.access_key}'; SET s3_secret_access_key='{credentials.secret_key}'; SET s3_session_token='{credentials.token}'; """) - - diff --git a/Data_query/spark_config.py b/Data_query/spark_config.py index ff602d3..90efc2b 100644 --- a/Data_query/spark_config.py +++ b/Data_query/spark_config.py @@ -1,4 +1,5 @@ -import os, sys +import os +import sys os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64" os.environ["PYSPARK_PYTHON"] = os.path.join(os.environ["VIRTUAL_ENV"], "bin", "python") @@ -7,25 +8,66 @@ sys.path.insert(0, os.path.join(spark_home, "python", "lib", "py4j-0.10.9.9-src.zip")) from pyspark.sql import SparkSession +from pyspark.sql.functions import abs as spark_abs +from pyspark.sql.functions import ( + avg, + col, + concat, + date_format, + dayofmonth, + dense_rank, + explode, + expr, + from_json, + greatest, + hour, + lag, + least, + lit, + month, + row_number, + to_date, + to_timestamp, + udf, + unix_timestamp, + when, + year, +) +from pyspark.sql.functions import count as spark_count +from pyspark.sql.functions import max as spark_max +from pyspark.sql.functions import min as spark_min +from pyspark.sql.functions import sum as spark_sum +from pyspark.sql.types import ( + ArrayType, + BooleanType, + DoubleType, + IntegerType, + LongType, + StringType, + StructField, + StructType, + TimestampType, +) from pyspark.sql.window import Window -from pyspark.sql.functions import col, explode, from_json, udf, to_timestamp, to_date, when, dayofmonth, month, year, hour, greatest, lit, \ - sum as spark_sum, lag, min as spark_min, count as spark_count, max as spark_max, expr, avg, least, concat, row_number, dense_rank, \ - date_format, unix_timestamp, abs as spark_abs -from pyspark.sql.types import ArrayType, StructType, StructField, StringType, DoubleType, IntegerType, BooleanType, TimestampType, LongType -spark = SparkSession.builder \ - .appName("S3Access") \ - .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \ - .config("spark.hadoop.fs.s3a.aws.credentials.provider", "software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider") \ - .config("spark.hadoop.fs.s3a.endpoint", "s3.ap-southeast-2.amazonaws.com") \ - .config("spark.sql.warehouse.dir", "s3a://project-ciccada/spark-warehouse/") \ - .config("spark.hadoop.hive.metastore.uris", "thrift://localhost:9083") \ - .config("spark.sql.catalogImplementation", "hive") \ - .config("spark.local.dir", "/mnt/spark-temp") \ - .config("spark.driver.memory", "40g") \ - .config("spark.sql.hive.metastore.jars", "/home/ubuntu/hive-4.0.1/lib/*") \ - .config("spark.sql.hive.metastore.version", "4.0.1") \ - .enableHiveSupport() \ + +spark = ( + SparkSession.builder.appName("S3Access") + .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") + .config( + "spark.hadoop.fs.s3a.aws.credentials.provider", + "software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider", + ) + .config("spark.hadoop.fs.s3a.endpoint", "s3.ap-southeast-2.amazonaws.com") + .config("spark.sql.warehouse.dir", "s3a://project-ciccada/spark-warehouse/") + .config("spark.hadoop.hive.metastore.uris", "thrift://localhost:9083") + .config("spark.sql.catalogImplementation", "hive") + .config("spark.local.dir", "/mnt/spark-temp") + .config("spark.driver.memory", "40g") + .config("spark.sql.hive.metastore.jars", "/home/ubuntu/hive-4.0.1/lib/*") + .config("spark.sql.hive.metastore.version", "4.0.1") + .enableHiveSupport() .getOrCreate() +) - # .config("spark.local.dir", "/home/ubuntu/tmp") \ - # .config("spark.local.dir", "/mnt/spark-temp") \ +# .config("spark.local.dir", "/home/ubuntu/tmp") \ +# .config("spark.local.dir", "/mnt/spark-temp") \ diff --git a/Data_query/trino_config.py b/Data_query/trino_config.py index 0c4b4fe..8215f07 100644 --- a/Data_query/trino_config.py +++ b/Data_query/trino_config.py @@ -1,36 +1,56 @@ -from sqlalchemy.engine import create_engine -from sqlalchemy import text import subprocess as sp -import pandas as pd +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed from time import sleep -from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed + import boto3 +import pandas as pd +from sqlalchemy import text +from sqlalchemy.engine import create_engine + session = boto3.Session() -s3 = boto3.client('s3') +s3 = boto3.client("s3") pool_size = 8 max_overflow = 20 pool_timeout = 90 -trino_hive = create_engine("trino://ubuntu@trino.ciccada:8080/hive/solar_analytics", pool_size=pool_size, max_overflow=max_overflow, pool_timeout=pool_timeout) -trino_iceberg = create_engine("trino://ubuntu@trino.ciccada:8080/iceberg/solar_analytics_iceberg", pool_size=pool_size, max_overflow=max_overflow, pool_timeout=pool_timeout) -trino_bom = create_engine("trino://ubuntu@trino.ciccada:8080/iceberg/BOM_NCI", pool_size=pool_size, max_overflow=max_overflow, pool_timeout=pool_timeout) +trino_hive = create_engine( + "trino://ubuntu@trino.ciccada:8080/hive/solar_analytics", + pool_size=pool_size, + max_overflow=max_overflow, + pool_timeout=pool_timeout, +) +trino_iceberg = create_engine( + "trino://ubuntu@trino.ciccada:8080/iceberg/solar_analytics_iceberg", + pool_size=pool_size, + max_overflow=max_overflow, + pool_timeout=pool_timeout, +) +trino_bom = create_engine( + "trino://ubuntu@trino.ciccada:8080/iceberg/BOM_NCI", + pool_size=pool_size, + max_overflow=max_overflow, + pool_timeout=pool_timeout, +) + def make_trino_engine(catalog): return create_engine( f"trino://ubuntu@trino.ciccada:8080/{catalog}", pool_size=1, max_overflow=0, - pool_pre_ping=True + pool_pre_ping=True, ) + def hive_sql(query: str) -> pd.DataFrame: - with trino_hive.connect() as conn: + with trino_hive.connect() as conn: df = pd.read_sql(query, conn) return df + def iceberg_sql(query: str) -> pd.DataFrame: engine = make_trino_engine("iceberg/solar_analytics_iceberg") - with engine.connect() as conn: + with engine.connect() as conn: # conn.execute(text("SET SESSION query_max_memory_per_node = '45GB'")) conn.execute(text("SET SESSION task_concurrency = 1")) result = conn.execution_options(stream_results=True).execute(text(query)) @@ -39,17 +59,20 @@ def iceberg_sql(query: str) -> pd.DataFrame: engine.dispose() return df + def iceberg_exec(query): with trino_iceberg.connect() as conn: # conn.execute(text("SET SESSION query_max_run_time = '60m'")) conn.execute(text(query)) print("Executed") + def hive_exec(query): with trino_hive.connect() as conn: conn.execute(text(query)) print("Executed") + def bom_exec(query): with trino_bom.connect() as conn: conn.execute(text(query)) @@ -59,7 +82,7 @@ def bom_exec(query): def trino_parallel_batch(run_func, tasks, num_workers=1, batch_size=4): results = [] for i in range(0, len(tasks), batch_size): - batch = tasks[i:i + batch_size] + batch = tasks[i : i + batch_size] with ProcessPoolExecutor(max_workers=num_workers) as executor: futures = [executor.submit(run_func, task) for task in batch] batch_results = [] @@ -75,7 +98,9 @@ def trino_parallel_batch(run_func, tasks, num_workers=1, batch_size=4): # print(f"Saving batch {i // batch_size + 1}") results.extend(batch_results) sleep(10) # Sleep after processing each batch to reduce load on Trino - if (i // batch_size + 1) % 5 == 0: # Sleep for a longer duration after every 5 batches + if ( + i // batch_size + 1 + ) % 5 == 0: # Sleep for a longer duration after every 5 batches print("Sleeping for 30 seconds to reduce load on Trino...") sleep(30) if results: @@ -84,6 +109,7 @@ def trino_parallel_batch(run_func, tasks, num_workers=1, batch_size=4): return None + def trino_parallel(run_func, tasks, num_workers=1): df_list = [] with ProcessPoolExecutor(max_workers=num_workers) as executor: @@ -101,11 +127,13 @@ def trino_parallel(run_func, tasks, num_workers=1): else: return None -region='ap-southeast-2' -service = 'trino-service' -worker_service = 'worker-trino-service' -big_worker_service = 'big-worker-trino-service' -cluster = 'my-ecs-cluster' + +region = "ap-southeast-2" +service = "trino-service" +worker_service = "worker-trino-service" +big_worker_service = "big-worker-trino-service" +cluster = "my-ecs-cluster" + def ensure_trino_running(worker_desired_count=1, big_worker_desired_count=0): cmd = f"aws ecs describe-services --cluster {cluster} --services {service} --query services[0].desiredCount --output text" @@ -114,13 +142,21 @@ def ensure_trino_running(worker_desired_count=1, big_worker_desired_count=0): cmd = f"aws ecs describe-services --cluster {cluster} --services {worker_service} --query services[0].desiredCount --output text" - worker_count = sp.run(cmd, shell=True, capture_output=True, text=True).stdout.strip() + worker_count = sp.run( + cmd, shell=True, capture_output=True, text=True + ).stdout.strip() cmd = f"aws ecs describe-services --cluster {cluster} --services {big_worker_service} --query services[0].desiredCount --output text" - big_worker_count = sp.run(cmd, shell=True, capture_output=True, text=True).stdout.strip() + big_worker_count = sp.run( + cmd, shell=True, capture_output=True, text=True + ).stdout.strip() - if trino_count == '0' or worker_count != str(worker_desired_count) or big_worker_count != str(big_worker_desired_count): + if ( + trino_count == "0" + or worker_count != str(worker_desired_count) + or big_worker_count != str(big_worker_desired_count) + ): print("Trino service is not running. Starting the service...") start_cmd = f"aws ecs update-service \ --cluster {cluster} \ @@ -143,7 +179,11 @@ def ensure_trino_running(worker_desired_count=1, big_worker_desired_count=0): sp.run(start_cmd, shell=True, capture_output=True, text=True).stdout.strip() print("Trino service triggered.") - sp.run(f"aws ecs wait services-stable --cluster {cluster} --services {service} {worker_service} {big_worker_service}", shell=True, check=True) + sp.run( + f"aws ecs wait services-stable --cluster {cluster} --services {service} {worker_service} {big_worker_service}", + shell=True, + check=True, + ) print(f"Service {service} is now stable.") else: print("Trino service is already running.") diff --git a/SolA2024_Analysis/Anti-islanding.ipynb b/SolA2024_Analysis/Anti-islanding.ipynb index 8b1a22b..452f8d0 100644 --- a/SolA2024_Analysis/Anti-islanding.ipynb +++ b/SolA2024_Analysis/Anti-islanding.ipynb @@ -7,12 +7,14 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import json\n", - "import numpy as np\n", + "\n", "import matplotlib.pyplot as plt\n", - "from visualisation import *\n", - "import pytz" + "import numpy as np\n", + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -30,7 +32,7 @@ } ], "source": [ - "stop_trino()\n" + "stop_trino()" ] }, { @@ -53,8 +55,8 @@ "big_workers = 0\n", "workers = 1\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", - "sleep(120)\n" + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", + "sleep(120)" ] }, { @@ -182,10 +184,12 @@ } ], "source": [ - "iceberg_sql((\"\"\"select * \n", + "iceberg_sql((\n", + " \"\"\"select * \n", " from conformance_antiisland \n", " where site_id = 625794481 and year=2024 and month=10\n", - " limit 5\"\"\")).head()" + " limit 5\"\"\"\n", + ")).head()" ] }, { @@ -1425,18 +1429,30 @@ " from data2\n", " group by site_id, day, day_night\n", " \"\"\")\n", - " # \n", + " #\n", " sleep(20)\n", - " print(f\"Completed year={year}, month={month}, v_threshold={v_threshold}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, v_threshold={v_threshold}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return None\n", "\n", - "tasks = [(year, month, v_threshold, split_cons) for year in (2024, 2025) for month in range(1, 13) for v_threshold in range(260, 266) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(16)] ]\n", "\n", - " # for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", - " \n", + "tasks = [\n", + " (year, month, v_threshold, split_cons)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for v_threshold in range(260, 266)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(16)] ]\n", + "\n", + "# for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", + "\n", "df = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -1648,7 +1664,7 @@ "source": [ "def run_func(args):\n", " year, month, v_threshold, split_cons = args\n", - " df =iceberg_sql(f\"\"\"with data as (\n", + " df = iceberg_sql(f\"\"\"with data as (\n", " select site_id, t_stamp, sum(cast(power*circuit_polarity as decimal(18, 6)))/1000 as P_kW, \n", " max(cast(voltage as decimal(18, 6))) as V, ac_capacity_kw\n", " from \n", @@ -1685,18 +1701,30 @@ " from data2\n", " group by site_id, day, day_night\n", " \"\"\")\n", - " # \n", + " #\n", " sleep(20)\n", - " print(f\"Completed year={year}, month={month}, v_threshold={v_threshold}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, v_threshold={v_threshold}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, v_threshold, split_cons) for year in (2024, 2025) for month in range(1, 13) for v_threshold in (260, 263) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(16)] ]\n", "\n", - " # for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", - " \n", + "tasks = [\n", + " (year, month, v_threshold, split_cons)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for v_threshold in (260, 263)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(16)] ]\n", + "\n", + "# for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", + "\n", "df = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -1895,7 +1923,7 @@ } ], "source": [ - "df.groupby(['v_threshold']).count().reset_index()" + "df.groupby([\"v_threshold\"]).count().reset_index()" ] }, { @@ -2050,16 +2078,27 @@ " from data2\n", " group by site_id, day, day_night\n", " \"\"\")\n", - " # \n", - " \n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " #\n", + "\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, 2025) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", - " \n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", + "\n", "df2 = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -2207,16 +2246,27 @@ " from data2\n", " group by site_id, day, day_night\n", " \"\"\")\n", - " # \n", - " \n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " #\n", + "\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, 2025) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", - " \n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", + "\n", "df1 = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -2238,7 +2288,7 @@ } ], "source": [ - "df2.shape, df1.shape, df2['site_id'].nunique(), df1['site_id'].nunique()" + "df2.shape, df1.shape, df2[\"site_id\"].nunique(), df1[\"site_id\"].nunique()" ] }, { @@ -2474,7 +2524,7 @@ } ], "source": [ - "df['site_id'].nunique()" + "df[\"site_id\"].nunique()" ] }, { diff --git a/SolA2024_Analysis/DNSP_OEMs.ipynb b/SolA2024_Analysis/DNSP_OEMs.ipynb index 9f534cf..dca8240 100644 --- a/SolA2024_Analysis/DNSP_OEMs.ipynb +++ b/SolA2024_Analysis/DNSP_OEMs.ipynb @@ -7,10 +7,12 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import json\n", + "\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import matplotlib.pyplot as plt" + "\n", + "from Data_query.trino_config import *" ] }, { @@ -43,7 +45,7 @@ "big_workers = 0\n", "workers = 1\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(30)" ] }, @@ -62,7 +64,7 @@ " where manufacturer in ('SolarEdge', 'Fronius', 'SMA', 'GoodWe', 'Solax', 'Sungrow', 'Solis', 'Solaredge', 'Goodwe', 'SolaX', 'SolaX Power')\n", " and dnsp_name in ('Energex', 'Ergon', 'Ausgrid', 'SAPN')\n", " order by OEM\n", - "\"\"\").to_csv('OEM_sites_visible_OEM.csv', index=False)" + "\"\"\").to_csv(\"OEM_sites_visible_OEM.csv\", index=False)" ] }, { diff --git a/SolA2024_Analysis/Flex_export_sites.ipynb b/SolA2024_Analysis/Flex_export_sites.ipynb index df7525e..0417f01 100644 --- a/SolA2024_Analysis/Flex_export_sites.ipynb +++ b/SolA2024_Analysis/Flex_export_sites.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -51,7 +52,7 @@ "big_workers = 2\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(40)" ] }, @@ -337,8 +338,10 @@ } ], "source": [ - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(60)\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -397,26 +400,41 @@ " group by b.site_id, date_trunc('day', t_stamp + interval '10' hour) \n", " having min(greatest(P_kw , prev_P_kw , prev_P_kw2, prev_P_kw3, prev_P_kw4) - least(P_kw , prev_P_kw , prev_P_kw2, prev_P_kw3, prev_P_kw4)) < .004\n", " \"\"\")\n", - " flex_export_sites = df['site_id'].unique()\n", - " print(f\"Found {len(flex_export_sites)} flex export sites for year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " flex_export_sites = df[\"site_id\"].unique()\n", + " print(\n", + " f\"Found {len(flex_export_sites)} flex export sites for year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " if len(flex_export_sites) == 0:\n", " # print('hi')\n", " return None\n", "\n", - " # sleep(random.randint(15, 30)) # add some randomness to avoid overwhelming trino with simultaneous queries \n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " # sleep(random.randint(15, 30)) # add some randomness to avoid overwhelming trino with simultaneous queries\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " # print(df.head(1))\n", " return df\n", - "tasks = [(year, month, split_cons) for year in (2025, 2024) for month in range(1, 13) \n", - " \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", "\n", - " # for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 16)]]\n", - " \n", - " \n", - "try: \n", - " df = trino_parallel_batch(run_func, tasks, num_workers=num_workers, batch_size=num_workers)\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2025, 2024)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "\n", + "# for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 16)]]\n", + "\n", + "\n", + "try:\n", + " df = trino_parallel_batch(\n", + " run_func, tasks, num_workers=num_workers, batch_size=num_workers\n", + " )\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", @@ -441,7 +459,7 @@ } ], "source": [ - "ensure_trino_running(worker_desired_count = 1, big_worker_desired_count=0)\n", + "ensure_trino_running(worker_desired_count=1, big_worker_desired_count=0)\n", "sleep(40)" ] }, @@ -463,7 +481,7 @@ } ], "source": [ - "df['site_id'].nunique()" + "df[\"site_id\"].nunique()" ] }, { @@ -527,7 +545,7 @@ " UPDATE meta_up23c\n", " SET flex_export_detected =\n", " CASE \n", - " WHEN site_id IN ({','.join(map(str, df['site_id'].unique()))}) THEN TRUE\n", + " WHEN site_id IN ({\",\".join(map(str, df[\"site_id\"].unique()))}) THEN TRUE\n", " ELSE FALSE\n", " END\n", " \"\"\")" @@ -915,7 +933,7 @@ } ], "source": [ - "iceberg_sql('select * from meta_up23c where flex_export_detected = true')" + "iceberg_sql(\"select * from meta_up23c where flex_export_detected = true\")" ] } ], diff --git a/SolA2024_Analysis/GHI_analysis.ipynb b/SolA2024_Analysis/GHI_analysis.ipynb index dc49c6d..423cbfb 100644 --- a/SolA2024_Analysis/GHI_analysis.ipynb +++ b/SolA2024_Analysis/GHI_analysis.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -28,7 +29,7 @@ } ], "source": [ - "stop_trino()\n" + "stop_trino()" ] }, { @@ -51,7 +52,7 @@ "big_workers = 8\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(40)" ] }, @@ -395,7 +396,7 @@ "# site_id = 1954814618 # flexible export\n", "def run_func(args):\n", " year, month, split_cons = args\n", - " state='QLD'\n", + " state = \"QLD\"\n", " df = iceberg_sql(f\"\"\"\n", " with data as \n", " (select \n", @@ -452,8 +453,10 @@ " group by b.site_id, date_trunc('day', t_stamp + interval '10' hour) \n", " having min(greatest(P_kw , prev_P_kw , prev_P_kw2, prev_P_kw3, prev_P_kw4) - least(P_kw , prev_P_kw , prev_P_kw2, prev_P_kw3, prev_P_kw4)) < .004\n", " \"\"\")\n", - " flex_export_sites = df['site_id'].unique()\n", - " print(f\"Found {len(flex_export_sites)} flex export sites for year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " flex_export_sites = df[\"site_id\"].unique()\n", + " print(\n", + " f\"Found {len(flex_export_sites)} flex export sites for year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " if len(flex_export_sites) == 0:\n", " # print('hi')\n", " return None\n", @@ -463,7 +466,7 @@ " site_id, t_stamp, sum(power*circuit_polarity)/1000 as P_kw,\n", " sum(energy_reactive*circuit_polarity )/1000*12 as Q_kvar, avg(voltage) as v_avg \n", " from ts join \n", - " (select site_id, circuit_id, circuit_polarity from meta_up23c where state='{state}' and is_pv=True and {split_cons} and site_id in ({','.join(map(str, flex_export_sites))}))\n", + " (select site_id, circuit_id, circuit_polarity from meta_up23c where state='{state}' and is_pv=True and {split_cons} and site_id in ({\",\".join(map(str, flex_export_sites))}))\n", " as m on ts.circuit_id = m.circuit_id\n", " where year = {year} and month = {month} and is_pv=True and voltage >= 200 and voltage <= 300 and {split_cons}\n", " group by site_id, t_stamp\n", @@ -472,7 +475,7 @@ " select \n", " distinct time, b.latitude, b.longitude, surface_global_irradiance as GHI, cloud_type\n", " from bom_nci.solar as b\n", - " join (select distinct site_id, n_lat, n_long from meta_up23c where state='{state}' and {split_cons} and site_id in ({','.join(map(str, flex_export_sites))})) as m \n", + " join (select distinct site_id, n_lat, n_long from meta_up23c where state='{state}' and {split_cons} and site_id in ({\",\".join(map(str, flex_export_sites))})) as m \n", " on b.latitude = m.n_lat and b.longitude = m.n_long\n", " where year = {year} and month = {month} and {split_cons} \n", " ),\n", @@ -538,7 +541,7 @@ " ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING\n", " ) AS P_est\n", " from \n", - " segments as d join (select distinct site_id, n_long, n_lat, ac_capacity_kw, export_limit_kw from meta_up23c where state='{state}' and is_pv=True and {split_cons} and site_id in ({','.join(map(str, flex_export_sites))})) m\n", + " segments as d join (select distinct site_id, n_long, n_lat, ac_capacity_kw, export_limit_kw from meta_up23c where state='{state}' and is_pv=True and {split_cons} and site_id in ({\",\".join(map(str, flex_export_sites))})) m\n", " on d.site_id = m.site_id\n", " join bom5min as b on \n", " m.n_lat = b.latitude and m.n_long = b.longitude and d.t_stamp = b.time_5min\n", @@ -548,21 +551,37 @@ " \n", " \n", " \"\"\")\n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " # sleep(60)\n", " # print(df.head(1))\n", " return df\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 16)]]\n", - " \n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 16)]]\n", + "\n", + "\n", + "try:\n", " df = trino_parallel(run_func, tasks, num_workers=num_workers)\n", - " df['t_stamp'] = pd.to_datetime(df['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", - " df['GHI'] = df['GHI'].fillna(-1)\n", - " df['cloud_type'] = df['cloud_type'].fillna(-1)\n", + " df[\"t_stamp\"] = (\n", + " pd.to_datetime(df[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + " )\n", + " df[\"GHI\"] = df[\"GHI\"].fillna(-1)\n", + " df[\"cloud_type\"] = df[\"cloud_type\"].fillna(-1)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", @@ -589,7 +608,7 @@ } ], "source": [ - "df['site_id'].nunique()" + "df[\"site_id\"].nunique()" ] }, { @@ -599,7 +618,7 @@ "metadata": {}, "outputs": [], "source": [ - "id_counter=-1" + "id_counter = -1" ] }, { @@ -619,46 +638,91 @@ } ], "source": [ - "id_counter+=1\n", - "sample_site_id = df.groupby('site_id').size().reset_index()\n", - "sample_site_id = sample_site_id[sample_site_id[0] > 0]['site_id'].tolist()\n", + "id_counter += 1\n", + "sample_site_id = df.groupby(\"site_id\").size().reset_index()\n", + "sample_site_id = sample_site_id[sample_site_id[0] > 0][\"site_id\"].tolist()\n", "df0 = df.query(f\"site_id=={sample_site_id[id_counter]}\").reset_index(drop=True)\n", - "t0 = df0['t_stamp'].min()\n", - "t1 = df0['t_stamp'].max()\n", - "if t0.time() == pd.Timestamp('00:00:00').time():\n", - " start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", + "t0 = df0[\"t_stamp\"].min()\n", + "t1 = df0[\"t_stamp\"].max()\n", + "if t0.time() == pd.Timestamp(\"00:00:00\").time():\n", + " start_time = t0.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " start_time = (t0 + pd.Timedelta(days=1)).replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", - "if t1.time() == pd.Timestamp('00:00:00').time():\n", - " end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", + " start_time = (\n", + " (t0 + pd.Timedelta(days=1))\n", + " .replace(hour=0, minute=0, second=0)\n", + " .strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", + " )\n", + "if t1.time() == pd.Timestamp(\"00:00:00\").time():\n", + " end_time = t1.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " end_time = t1.replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", + " end_time = t1.replace(hour=0, minute=0, second=0).strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "# start_time = df0['t_stamp'].min().strftime('%Y-%m-%d %H:%M:%S%z')\t # In sydney local time\n", "\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/Test.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['GHI', 'Cloud Type', 'Active Power (kW)', 'Active Power (kW)','Reactive Power (kVAR)']\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/Test.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"GHI\",\n", + " \"Cloud Type\",\n", + " \"Active Power (kW)\",\n", + " \"Active Power (kW)\",\n", + " \"Reactive Power (kVAR)\",\n", + "]\n", "# y_labels = ['GHI', 'Cloud Type', 'Apparent Power (kVA)', 'Average Voltage (V)', 'Power Factor']\n", - "plt_config = {'GHI': [0, 0, '-', None, None], 'cloud_type': [0, 1, '-', None, None],\n", - "'P_kw': [1, 0, '-', None, None], 'P_est': [1, 0, '-', None, None], 'Q_kvar': [1, 1, '-', None, None]}\n", + "plt_config = {\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + " \"P_kw\": [1, 0, \"-\", None, None],\n", + " \"P_est\": [1, 0, \"-\", None, None],\n", + " \"Q_kvar\": [1, 1, \"-\", None, None],\n", + "}\n", "# 'S_kva': [1, 0, '-', None, None], 'v_avg': [1, 1, '-', None, None], 'pf': [2, 0, '-', None, None]}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df0, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.3], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=200, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'center left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=0, onlyntime=0, show=False)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df0,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.3],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=200,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"center left\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + " onlyntime=0,\n", + " show=False,\n", + ")\n", "a.do()\n", - "print(df0['ac_capacity_kw'].unique())\n", - "print(df0['export_limit_kw'].unique())\n", + "print(df0[\"ac_capacity_kw\"].unique())\n", + "print(df0[\"export_limit_kw\"].unique())\n", "# df['t_stamp'].dt.date.nunique()" ] }, @@ -701,9 +765,15 @@ " \"\"\")\n", " sleep(2)\n", " return df\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(11, 12) \n", - " for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(1)]]\n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(11, 12)\n", + " for split_cons in [f\"system.bucket(postcode, 16) = {i}\" for i in range(1)]\n", + "]\n", + "try:\n", " df = trino_parallel(run_func, tasks, num_workers=num_workers)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", @@ -742,8 +812,8 @@ "source": [ "## count the number of timestamps per site when voltvar is not required with GHI > 0\n", "def run_func(args):\n", - " year, month = args\n", - " df = iceberg_sql(f\"\"\"with data as (select site_id, t_stamp, n_lat, n_long\n", + " year, month = args\n", + " df = iceberg_sql(f\"\"\"with data as (select site_id, t_stamp, n_lat, n_long\n", " from ts inner join meta_up23c as m on ts.circuit_id = m.circuit_id\n", " where year = {year} and month = {month} and ts.is_pv=True\n", " group by t_stamp, site_id, n_lat, n_long\n", @@ -773,10 +843,12 @@ " order by num_data\n", " \n", " \"\"\")\n", - " sleep(10)\n", - " print(f\"Completed year={year}, month={month}\")\n", - " return df\n", - "tasks = [(year, month) for year in (2024, ) for month in range(1, 13) ] \n", + " sleep(10)\n", + " print(f\"Completed year={year}, month={month}\")\n", + " return df\n", + "\n", + "\n", + "tasks = [(year, month) for year in (2024,) for month in range(1, 13)]\n", "df_vlim = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -808,8 +880,8 @@ "source": [ "## count the number of timestamps per site when for all voltage range with GHI > 0\n", "def run_func(args):\n", - " year, month = args\n", - " df = iceberg_sql(f\"\"\"with data as (select site_id, t_stamp, n_lat, n_long\n", + " year, month = args\n", + " df = iceberg_sql(f\"\"\"with data as (select site_id, t_stamp, n_lat, n_long\n", " from ts inner join meta_up23c as m on ts.circuit_id = m.circuit_id\n", " where year = {year} and month = {month} and ts.is_pv=True\n", " group by t_stamp, site_id, n_lat, n_long),\n", @@ -838,10 +910,12 @@ " order by num_data\n", " \n", " \"\"\")\n", - " sleep(10)\n", - " print(f\"Completed year={year}, month={month}\")\n", - " return df\n", - "tasks = [(year, month) for year in (2024, ) for month in range(1, 13) ] \n", + " sleep(10)\n", + " print(f\"Completed year={year}, month={month}\")\n", + " return df\n", + "\n", + "\n", + "tasks = [(year, month) for year in (2024,) for month in range(1, 13)]\n", "df_vall = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -852,7 +926,7 @@ "metadata": {}, "outputs": [], "source": [ - "df1 = df_vlim.merge(df_vall, on='site_id', suffixes=('_vlim', '_vall'))" + "df1 = df_vlim.merge(df_vall, on=\"site_id\", suffixes=(\"_vlim\", \"_vall\"))" ] }, { @@ -873,7 +947,7 @@ } ], "source": [ - "df1.query(\"num_data_vlim < 10 and num_data_vall > 500\")['site_id'].nunique()" + "df1.query(\"num_data_vlim < 10 and num_data_vall > 500\")[\"site_id\"].nunique()" ] }, { @@ -1124,9 +1198,13 @@ "\"\"\")\n", "# having count(cloud_type=0) > 10\n", "# having sum(cloud_type) = 0\n", - "df['t_stamp'] = pd.to_datetime(df['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", - "df['GHI'] = df['GHI'].fillna(-1)\n", - "df['cloud_type'] = df['cloud_type'].fillna(-1)\n", + "df[\"t_stamp\"] = (\n", + " pd.to_datetime(df[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + ")\n", + "df[\"GHI\"] = df[\"GHI\"].fillna(-1)\n", + "df[\"cloud_type\"] = df[\"cloud_type\"].fillna(-1)\n", "df[:2]" ] }, @@ -1140,7 +1218,7 @@ "iceberg_sql(\"\"\"\n", " select distinct site_id, state, longitude, latitude\n", " from meta_up23c \n", - " \"\"\").to_csv('site_locations.csv', index=False)" + " \"\"\").to_csv(\"site_locations.csv\", index=False)" ] }, { @@ -1404,14 +1482,14 @@ } ], "source": [ - " # where avg_pf < .75 and ac_capacity_kw >= 1 and min_time < timestamp '2024-05-01 00:00:00'\n", + "# where avg_pf < .75 and ac_capacity_kw >= 1 and min_time < timestamp '2024-05-01 00:00:00'\n", "\n", "iceberg_sql(\"\"\"\n", " select distinct site_id, state, ac_capacity_kw, export_limit_kw, min_time, max_time, longitude, latitude, distance_km, pv_install_date,pf_01, avg_pf\n", " from meta_up23c \n", " where pf_01 > .95 and state='NSW' and ac_capacity_kw <= 15 and min_time < timestamp '2024-02-01 00:00:00' and is_pv=True\n", " order by pf_01 \n", - " \"\"\")\n" + " \"\"\")" ] }, { @@ -1596,7 +1674,9 @@ "# month in ({','.join(map(str, months))})\n", "months = [1, 2, 3, 4, 5, 6]\n", "# site_id in ({','.join(map(str, sample_site_id))})\n", - "state='QLD'\n", + "state = \"QLD\"\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -1670,8 +1750,10 @@ " AND prev_P_kw4 IS NOT NULL\n", " and greatest(P_kw , prev_P_kw , prev_P_kw2, prev_P_kw3, prev_P_kw4) - least(P_kw , prev_P_kw , prev_P_kw2, prev_P_kw3, prev_P_kw4) < .004\n", " \"\"\")\n", - " flex_export_sites = df['site_id'].unique()\n", - " print(f\"Found {len(flex_export_sites)} flex export sites for year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " flex_export_sites = df[\"site_id\"].unique()\n", + " print(\n", + " f\"Found {len(flex_export_sites)} flex export sites for year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " sleep(10)\n", " df = iceberg_sql(f\"\"\"\n", " with data as \n", @@ -1679,7 +1761,7 @@ " site_id, t_stamp, sum(cast(power*circuit_polarity as decimal(38, 6)))/1000 as P_kw,\n", " sum(cast(energy_reactive*circuit_polarity as decimal(38, 6)))/1000*12 as Q_kvar, avg(voltage) as v_avg \n", " from ts join \n", - " (select site_id, circuit_id, circuit_polarity from meta_up23c where state='{state}' and is_pv=True and {split_cons} and site_id in ({','.join(map(str, flex_export_sites))}))\n", + " (select site_id, circuit_id, circuit_polarity from meta_up23c where state='{state}' and is_pv=True and {split_cons} and site_id in ({\",\".join(map(str, flex_export_sites))}))\n", " as m on ts.circuit_id = m.circuit_id\n", " where year = {year} and month = {month} and is_pv=True and voltage >= 200 and voltage <= 300 and {split_cons}\n", " group by site_id, t_stamp\n", @@ -1688,7 +1770,7 @@ " select \n", " distinct time, b.latitude, b.longitude, surface_global_irradiance as GHI, cloud_type\n", " from bom_nci.solar as b\n", - " join (select distinct site_id, n_lat, n_long from meta_up23c where state='{state}' and {split_cons} and site_id in ({','.join(map(str, flex_export_sites))})) as m \n", + " join (select distinct site_id, n_lat, n_long from meta_up23c where state='{state}' and {split_cons} and site_id in ({\",\".join(map(str, flex_export_sites))})) as m \n", " on b.latitude = m.n_lat and b.longitude = m.n_long\n", " where year = {year} and month = {month} and {split_cons} \n", " ),\n", @@ -1803,19 +1885,35 @@ " \n", " \"\"\")\n", " sleep(10)\n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(11, 12) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 1)]]\n", - " \n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(11, 12)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 1)]]\n", + "\n", + "\n", + "try:\n", " df = trino_parallel(run_func, tasks, num_workers=num_workers)\n", - " df['t_stamp'] = pd.to_datetime(df['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", - " df['GHI'] = df['GHI'].fillna(-1)\n", - " df['cloud_type'] = df['cloud_type'].fillna(-1)\n", + " df[\"t_stamp\"] = (\n", + " pd.to_datetime(df[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + " )\n", + " df[\"GHI\"] = df[\"GHI\"].fillna(-1)\n", + " df[\"cloud_type\"] = df[\"cloud_type\"].fillna(-1)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", @@ -1964,7 +2062,7 @@ } ], "source": [ - "iceberg_sql('select * from meta_up23c where site_id = 1898602106')" + "iceberg_sql(\"select * from meta_up23c where site_id = 1898602106\")" ] }, { @@ -1993,45 +2091,90 @@ } ], "source": [ - "id_counter+=1\n", - "sample_site_id = df.groupby('site_id').size().reset_index()\n", - "sample_site_id = sample_site_id[sample_site_id[0] > 0]['site_id'].tolist()\n", + "id_counter += 1\n", + "sample_site_id = df.groupby(\"site_id\").size().reset_index()\n", + "sample_site_id = sample_site_id[sample_site_id[0] > 0][\"site_id\"].tolist()\n", "df0 = df.query(f\"site_id=={sample_site_id[id_counter]}\").reset_index(drop=True)\n", - "t0 = df0['t_stamp'].min()\n", - "t1 = df0['t_stamp'].max()\n", - "if t0.time() == pd.Timestamp('00:00:00').time():\n", - " start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", + "t0 = df0[\"t_stamp\"].min()\n", + "t1 = df0[\"t_stamp\"].max()\n", + "if t0.time() == pd.Timestamp(\"00:00:00\").time():\n", + " start_time = t0.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " start_time = (t0 + pd.Timedelta(days=1)).replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", - "if t1.time() == pd.Timestamp('00:00:00').time():\n", - " end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", + " start_time = (\n", + " (t0 + pd.Timedelta(days=1))\n", + " .replace(hour=0, minute=0, second=0)\n", + " .strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", + " )\n", + "if t1.time() == pd.Timestamp(\"00:00:00\").time():\n", + " end_time = t1.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " end_time = t1.replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", + " end_time = t1.replace(hour=0, minute=0, second=0).strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "# start_time = df0['t_stamp'].min().strftime('%Y-%m-%d %H:%M:%S%z')\t # In sydney local time\n", "\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/Test.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['GHI', 'Cloud Type', 'Active Power (kW)', 'Active Power (kW)','Reactive Power (kVAR)']\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/Test.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"GHI\",\n", + " \"Cloud Type\",\n", + " \"Active Power (kW)\",\n", + " \"Active Power (kW)\",\n", + " \"Reactive Power (kVAR)\",\n", + "]\n", "# y_labels = ['GHI', 'Cloud Type', 'Apparent Power (kVA)', 'Average Voltage (V)', 'Power Factor']\n", - "plt_config = {'GHI': [0, 0, '-', None, None], 'cloud_type': [0, 1, '-', None, None],\n", - "'P_kw': [1, 0, '-', None, None], 'P_est': [1, 0, '-', None, None], 'Q_kvar': [1, 1, '-', None, None]}\n", + "plt_config = {\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + " \"P_kw\": [1, 0, \"-\", None, None],\n", + " \"P_est\": [1, 0, \"-\", None, None],\n", + " \"Q_kvar\": [1, 1, \"-\", None, None],\n", + "}\n", "# 'S_kva': [1, 0, '-', None, None], 'v_avg': [1, 1, '-', None, None], 'pf': [2, 0, '-', None, None]}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df0, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.3], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=200, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'center left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=0, onlyntime=0, show=False)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df0,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.3],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=200,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"center left\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + " onlyntime=0,\n", + " show=False,\n", + ")\n", "a.do()\n", - "print(df0['ac_capacity_kw'].unique())\n", + "print(df0[\"ac_capacity_kw\"].unique())\n", "# df['t_stamp'].dt.date.nunique()" ] }, @@ -2053,7 +2196,7 @@ } ], "source": [ - "df['site_id'].nunique()" + "df[\"site_id\"].nunique()" ] }, { @@ -2791,11 +2934,40 @@ ], "source": [ "df2 = df0.copy()\n", - "df2['flatness'] = df2.apply(lambda row:\n", - " (max(row['P_kw'], row['prev_P_kw'], row['prev_P_kw2'], row['prev_P_kw3'], row['prev_P_kw4']) - \n", - " min(row['P_kw'], row['prev_P_kw'], row['prev_P_kw2'], row['prev_P_kw3'], row['prev_P_kw4'])), axis=1)\n", - "df2.loc[(df2['flatness'] < 0.004) & (df2['t_stamp'] > '2024-01-21') & df2['prev_P_kw4'].notnull() & (df2['P_kw'] > 0.2*df2['ac_capacity_kw']) , \n", - " ['t_stamp', 'P_kw', 'prev_P_kw', 'prev_P_kw2', 'prev_P_kw3', 'prev_P_kw4', 'flatness']]" + "df2[\"flatness\"] = df2.apply(\n", + " lambda row: (\n", + " max(\n", + " row[\"P_kw\"],\n", + " row[\"prev_P_kw\"],\n", + " row[\"prev_P_kw2\"],\n", + " row[\"prev_P_kw3\"],\n", + " row[\"prev_P_kw4\"],\n", + " )\n", + " - min(\n", + " row[\"P_kw\"],\n", + " row[\"prev_P_kw\"],\n", + " row[\"prev_P_kw2\"],\n", + " row[\"prev_P_kw3\"],\n", + " row[\"prev_P_kw4\"],\n", + " )\n", + " ),\n", + " axis=1,\n", + ")\n", + "df2.loc[\n", + " (df2[\"flatness\"] < 0.004)\n", + " & (df2[\"t_stamp\"] > \"2024-01-21\")\n", + " & df2[\"prev_P_kw4\"].notnull()\n", + " & (df2[\"P_kw\"] > 0.2 * df2[\"ac_capacity_kw\"]),\n", + " [\n", + " \"t_stamp\",\n", + " \"P_kw\",\n", + " \"prev_P_kw\",\n", + " \"prev_P_kw2\",\n", + " \"prev_P_kw3\",\n", + " \"prev_P_kw4\",\n", + " \"flatness\",\n", + " ],\n", + "]" ] }, { @@ -3169,13 +3341,13 @@ "df = iceberg_sql(f\"\"\"with data as (select site_id, t_stamp, sum(cast(power*circuit_polarity as decimal(38, 6)))/1000 as P_kw, \n", " sum(cast(energy_reactive*circuit_polarity as decimal(38, 6)))/1000*12 as Q_kvar, avg(voltage) as v_avg, n_lat, n_long\n", " from ts inner join meta_up23c as m on ts.circuit_id = m.circuit_id\n", - " where site_id in ({','.join(map(str, sample_site_id))}) and ts.is_pv=True and voltage >= 200 and voltage <= 300\n", + " where site_id in ({\",\".join(map(str, sample_site_id))}) and ts.is_pv=True and voltage >= 200 and voltage <= 300\n", " group by t_stamp, site_id, n_lat, n_long),\n", "\n", " bom10min as (select distinct time, b.latitude, b.longitude, surface_global_irradiance as GHI, cloud_type\n", " from bom_nci.solar as b\n", " inner join meta_up23c as m on b.latitude = m.n_lat and b.longitude = m.n_long\n", - " where site_id in ({','.join(map(str, sample_site_id))})\n", + " where site_id in ({\",\".join(map(str, sample_site_id))})\n", " ),\n", "\n", " bom5min as ((select time as time_5min, latitude, longitude, GHI, cloud_type\n", @@ -3217,9 +3389,13 @@ "\n", "# having count(cloud_type=0) > 10\n", "# having sum(cloud_type) = 0\n", - "df['t_stamp'] = pd.to_datetime(df['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", - "df['GHI'] = df['GHI'].fillna(-1)\n", - "df['cloud_type'] = df['cloud_type'].fillna(-1)\n", + "df[\"t_stamp\"] = (\n", + " pd.to_datetime(df[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + ")\n", + "df[\"GHI\"] = df[\"GHI\"].fillna(-1)\n", + "df[\"cloud_type\"] = df[\"cloud_type\"].fillna(-1)\n", "df[:2]" ] }, @@ -3230,11 +3406,11 @@ "metadata": {}, "outputs": [], "source": [ - "station_temp = pd.read_parquet('SolA2024_Analysis/station_temp_2024.parquet')\n", - "site_station = pd.read_parquet('SolA2024_Analysis/site_station_mapping.parquet')\n", - "site_station['station_1'] = site_station['station_1'].astype('int64')\n", - "site_station['station_2'] = site_station['station_2'].astype('int64')\n", - "site_station['station_3'] = site_station['station_3'].astype('int64')" + "station_temp = pd.read_parquet(\"SolA2024_Analysis/station_temp_2024.parquet\")\n", + "site_station = pd.read_parquet(\"SolA2024_Analysis/site_station_mapping.parquet\")\n", + "site_station[\"station_1\"] = site_station[\"station_1\"].astype(\"int64\")\n", + "site_station[\"station_2\"] = site_station[\"station_2\"].astype(\"int64\")\n", + "site_station[\"station_3\"] = site_station[\"station_3\"].astype(\"int64\")" ] }, { @@ -3244,11 +3420,13 @@ "metadata": {}, "outputs": [], "source": [ - "df_max_GHI = df.groupby(['site_id', df['t_stamp'].dt.date]).agg(\n", - " max_GHI=('GHI', 'max'),\n", - " max_P_kw = ('P_kw', 'max')\n", - ").reset_index().rename(columns={'t_stamp': 'day'})\n", - "df_max_GHI['day'] = pd.to_datetime(df_max_GHI['day'])" + "df_max_GHI = (\n", + " df.groupby([\"site_id\", df[\"t_stamp\"].dt.date])\n", + " .agg(max_GHI=(\"GHI\", \"max\"), max_P_kw=(\"P_kw\", \"max\"))\n", + " .reset_index()\n", + " .rename(columns={\"t_stamp\": \"day\"})\n", + ")\n", + "df_max_GHI[\"day\"] = pd.to_datetime(df_max_GHI[\"day\"])" ] }, { @@ -3260,27 +3438,26 @@ "source": [ "# df_m = df_max_GHI.merge(site_station, on='site_id').merge(station_temp, on=['station_number', 'day'], how='left')[['day', 'max_P_kw', 'max_GHI', 'temp', 'site_id', 'station_number']]\n", "df_m = (\n", - " df_max_GHI\n", - " .merge(site_station, on='site_id')\n", + " df_max_GHI.merge(site_station, on=\"site_id\")\n", " .merge(\n", - " station_temp.rename(columns={'station_number': 'station_1', 'temp': 'temp_1'}),\n", - " on=['station_1', 'day'],\n", - " how='left'\n", + " station_temp.rename(columns={\"station_number\": \"station_1\", \"temp\": \"temp_1\"}),\n", + " on=[\"station_1\", \"day\"],\n", + " how=\"left\",\n", " )\n", " .merge(\n", - " station_temp.rename(columns={'station_number': 'station_2', 'temp': 'temp_2'}),\n", - " on=['station_2', 'day'],\n", - " how='left'\n", + " station_temp.rename(columns={\"station_number\": \"station_2\", \"temp\": \"temp_2\"}),\n", + " on=[\"station_2\", \"day\"],\n", + " how=\"left\",\n", " )\n", " .merge(\n", - " station_temp.rename(columns={'station_number': 'station_3', 'temp': 'temp_3'}),\n", - " on=['station_3', 'day'],\n", - " how='left'\n", + " station_temp.rename(columns={\"station_number\": \"station_3\", \"temp\": \"temp_3\"}),\n", + " on=[\"station_3\", \"day\"],\n", + " how=\"left\",\n", " )\n", ")\n", - "df_m['temp'] = df_m[['temp_1', 'temp_2', 'temp_3']].bfill(axis=1).iloc[:, 0]\n", - "df_m = df_m[['day', 'max_P_kw', 'max_GHI', 'temp', 'site_id']]\n", - "df_m['GHI_T'] = df_m['max_GHI'] * df_m['temp']\n", + "df_m[\"temp\"] = df_m[[\"temp_1\", \"temp_2\", \"temp_3\"]].bfill(axis=1).iloc[:, 0]\n", + "df_m = df_m[[\"day\", \"max_P_kw\", \"max_GHI\", \"temp\", \"site_id\"]]\n", + "df_m[\"GHI_T\"] = df_m[\"max_GHI\"] * df_m[\"temp\"]\n", "# df_m['GHI_T'] = df_m['GHI_T'] * df_m['max_GHI']\n", "# df_m['GHI_T'] = df_m['GHI_T'] * df_m['max_GHI']\n", "# df_m['GHI_T'] = df_m['GHI_T'] * df_m['temp']\n", @@ -3308,18 +3485,18 @@ "\n", "# X = df_m[['max_GHI']]\n", "# X = df_m[['GHI_T', 'max_GHI']]\n", - "X = df_m[['GHI_T', 'max_GHI']]\n", - "y = df_m['max_P_kw']\n", + "X = df_m[[\"GHI_T\", \"max_GHI\"]]\n", + "y = df_m[\"max_P_kw\"]\n", "\n", "model = LinearRegression()\n", "model.fit(X, y)\n", "\n", "# coefficients\n", - "print('Intercept:', model.intercept_)\n", - "print('Coefficients:', dict(zip(X.columns, model.coef_)))\n", + "print(\"Intercept:\", model.intercept_)\n", + "print(\"Coefficients:\", dict(zip(X.columns, model.coef_)))\n", "\n", "# predictions\n", - "df_m['max_P_kw_pred'] = model.predict(X)" + "df_m[\"max_P_kw_pred\"] = model.predict(X)" ] }, { @@ -3348,11 +3525,11 @@ ], "source": [ "plt.figure(figsize=(6, 4))\n", - "plt.plot(df_m['day'], df_m['max_P_kw_pred'], 'r', label='Predicted')\n", - "plt.plot(df_m['day'], df_m['max_P_kw'], 'b', label='Actual')\n", - "print('R^2:', model.score(X, y), (df_m['max_P_kw_pred'] - df_m['max_P_kw']).abs().sum())\n", + "plt.plot(df_m[\"day\"], df_m[\"max_P_kw_pred\"], \"r\", label=\"Predicted\")\n", + "plt.plot(df_m[\"day\"], df_m[\"max_P_kw\"], \"b\", label=\"Actual\")\n", + "print(\"R^2:\", model.score(X, y), (df_m[\"max_P_kw_pred\"] - df_m[\"max_P_kw\"]).abs().sum())\n", "plt.legend()\n", - "plt.show()\n" + "plt.show()" ] }, { @@ -3371,9 +3548,9 @@ ], "source": [ "plt.figure(figsize=(6, 4))\n", - "plt.plot(df_m['day'], df_m['max_P_kw_pred'])\n", - "plt.plot(df_m['day'], df_m['max_P_kw'])\n", - "print('R^2:', model.score(X, y), (df_m['max_P_kw_pred'] - df_m['max_P_kw']).abs().sum())" + "plt.plot(df_m[\"day\"], df_m[\"max_P_kw_pred\"])\n", + "plt.plot(df_m[\"day\"], df_m[\"max_P_kw\"])\n", + "print(\"R^2:\", model.score(X, y), (df_m[\"max_P_kw_pred\"] - df_m[\"max_P_kw\"]).abs().sum())" ] }, { @@ -3392,9 +3569,9 @@ ], "source": [ "plt.figure(figsize=(6, 4))\n", - "plt.plot(df_m['day'], df_m['max_P_kw_pred'])\n", - "plt.plot(df_m['day'], df_m['max_P_kw'])\n", - "print('R^2:', model.score(X, y), (df_m['max_P_kw_pred'] - df_m['max_P_kw']).abs().sum())" + "plt.plot(df_m[\"day\"], df_m[\"max_P_kw_pred\"])\n", + "plt.plot(df_m[\"day\"], df_m[\"max_P_kw\"])\n", + "print(\"R^2:\", model.score(X, y), (df_m[\"max_P_kw_pred\"] - df_m[\"max_P_kw\"]).abs().sum())" ] }, { @@ -3837,28 +4014,78 @@ } ], "source": [ - "start_time = f'{year}-{month:02d}-2 00:00:00+10:00'\t # In sydney local time\n", - "end_time = f'{year}-{month:02d}-10 00:00:00+10:00'\t # In sydney local time\n", + "start_time = f\"{year}-{month:02d}-2 00:00:00+10:00\" # In sydney local time\n", + "end_time = f\"{year}-{month:02d}-10 00:00:00+10:00\" # In sydney local time\n", "\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['Active power (kW)', 'Reactive power (kvar)', 'GHI', 'Cloud Type', 'Apparent Power (kVA)', 'Average Voltage (V)', 'Power Factor']\n", - "plt_config = {'P_kw': [1, 0, '-', None, None], 'Q_kvar': [1, 1, '-', None, None], 'GHI': [0, 0, '-', None, None], 'cloud_type': [0, 1, '-', None, None],\n", - "'S_kva': [2, 0, '-', None, None], 'v_avg': [2, 1, '-', None, None], 'pf': [3, 0, '-', None, None]}\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"Active power (kW)\",\n", + " \"Reactive power (kvar)\",\n", + " \"GHI\",\n", + " \"Cloud Type\",\n", + " \"Apparent Power (kVA)\",\n", + " \"Average Voltage (V)\",\n", + " \"Power Factor\",\n", + "]\n", + "plt_config = {\n", + " \"P_kw\": [1, 0, \"-\", None, None],\n", + " \"Q_kvar\": [1, 1, \"-\", None, None],\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + " \"S_kva\": [2, 0, \"-\", None, None],\n", + " \"v_avg\": [2, 1, \"-\", None, None],\n", + " \"pf\": [3, 0, \"-\", None, None],\n", + "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.5], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['center left', 'upper right', 'center left', 'upper right', 'center left'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=0, onlyntime=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.5],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\n", + " \"center left\",\n", + " \"upper right\",\n", + " \"center left\",\n", + " \"upper right\",\n", + " \"center left\",\n", + " ],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + " onlyntime=0,\n", + ")\n", "a.do()" ] }, diff --git a/SolA2024_Analysis/NetMeter.ipynb b/SolA2024_Analysis/NetMeter.ipynb index 3f89ee3..5b5dc90 100644 --- a/SolA2024_Analysis/NetMeter.ipynb +++ b/SolA2024_Analysis/NetMeter.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -20,7 +21,7 @@ "metadata": {}, "outputs": [], "source": [ - "stop_trino()\n" + "stop_trino()" ] }, { @@ -40,8 +41,8 @@ } ], "source": [ - "ensure_trino_running(worker_desired_count = 1, big_worker_desired_count=0)\n", - "sleep(30)\n" + "ensure_trino_running(worker_desired_count=1, big_worker_desired_count=0)\n", + "sleep(30)" ] }, { @@ -679,9 +680,8 @@ } ], "source": [ - "\n", "df_list = []\n", - "for year in (2024, ):\n", + "for year in (2024,):\n", " for month in range(11, 12):\n", " df = iceberg_sql(f\"\"\"\n", " with data as (\n", @@ -756,7 +756,11 @@ " where date_trunc('day', t_stamp + interval '10' hour) in (select day from clear_sky)\n", " \n", "\"\"\")\n", - "df['t_stamp'] = pd.to_datetime(df['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))" + "df[\"t_stamp\"] = (\n", + " pd.to_datetime(df[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + ")" ] }, { @@ -841,27 +845,61 @@ } ], "source": [ - "start_time = '2024-03-1 00:00:00+10:00'\t # In sydney local time\n", - "end_time = '2024-04-1 00:00:00+10:00'\t # In sydney local time\n", + "start_time = \"2024-03-1 00:00:00+10:00\" # In sydney local time\n", + "end_time = \"2024-04-1 00:00:00+10:00\" # In sydney local time\n", "\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/ClearSky_Feb2024.png'\n", - "x_label = 'time'\n", - "y_labels = ['Active power (kW)', 'Reactive power (kvar)', 'GHI', 'Cloud Type']\n", - "plt_config = {'P_kw': [1, 0, '-', None, None], 'Q_kvar': [1, 0, '-', None, None], 'GHI': [0, 0, '-', None, None], 'cloud_type': [0, 1, '-', None, None]}\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/ClearSky_Feb2024.png\"\n", + "x_label = \"time\"\n", + "y_labels = [\"Active power (kW)\", \"Reactive power (kvar)\", \"GHI\", \"Cloud Type\"]\n", + "plt_config = {\n", + " \"P_kw\": [1, 0, \"-\", None, None],\n", + " \"Q_kvar\": [1, 0, \"-\", None, None],\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='circuit_type', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.5], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['lower left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=1, onlyntime=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"circuit_type\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.5],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"lower left\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=1,\n", + " onlyntime=0,\n", + ")\n", "a.do()" ] } diff --git a/SolA2024_Analysis/OEM_installDate_confrate.ipynb b/SolA2024_Analysis/OEM_installDate_confrate.ipynb index 04049e6..7b17c69 100644 --- a/SolA2024_Analysis/OEM_installDate_confrate.ipynb +++ b/SolA2024_Analysis/OEM_installDate_confrate.ipynb @@ -7,10 +7,12 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import json\n", + "\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import matplotlib.pyplot as plt" + "\n", + "from Data_query.trino_config import *" ] }, { @@ -41,7 +43,7 @@ "big_workers = 0\n", "workers = 1\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(30)" ] }, diff --git a/SolA2024_Analysis/PQRMs.ipynb b/SolA2024_Analysis/PQRMs.ipynb index a129fa6..9e889c2 100644 --- a/SolA2024_Analysis/PQRMs.ipynb +++ b/SolA2024_Analysis/PQRMs.ipynb @@ -21,35 +21,37 @@ " if V < v1:\n", " return Srated\n", " elif V > v2:\n", - " return .2 * Srated\n", + " return 0.2 * Srated\n", " else:\n", - " m = (Srated - .2*Srated) / (v1 - v2)\n", - " P = m * (V - v2) + .2*Srated\n", + " m = (Srated - 0.2 * Srated) / (v1 - v2)\n", + " P = m * (V - v2) + 0.2 * Srated\n", " return P\n", - " \n", - "def get_voltvar_Q(V, P, Srated=1, v1=207, v2=220, v3=240, v4=258, Q1=.44, Q4=.60):\n", - " if np.abs(P) < .2 * Srated:\n", + "\n", + "\n", + "def get_voltvar_Q(V, P, Srated=1, v1=207, v2=220, v3=240, v4=258, Q1=0.44, Q4=0.60):\n", + " if np.abs(P) < 0.2 * Srated:\n", " return 0\n", " if V <= v1:\n", - " Q = Q1* Srated\n", + " Q = Q1 * Srated\n", " elif v1 <= V < v2:\n", - " m = (Q1* Srated - 0) / (v1 - v2)\n", + " m = (Q1 * Srated - 0) / (v1 - v2)\n", " Q = m * (V - v2)\n", " elif v2 <= V <= v3:\n", " Q = 0\n", " elif v3 < V < v4:\n", - " m = (0 - Q4* Srated) / (v3 - v4)\n", - " Q = -m * (V - v4) - Q4* Srated\n", + " m = (0 - Q4 * Srated) / (v3 - v4)\n", + " Q = -m * (V - v4) - Q4 * Srated\n", " else: # V >= v4\n", - " Q = - Q4* Srated\n", + " Q = -Q4 * Srated\n", " return Q\n", "\n", + "\n", "def Q_capability(P, Srated=1):\n", - " if P < .2 * Srated:\n", + " if P < 0.2 * Srated:\n", " Q = 0\n", - " elif P <= .6 * Srated:\n", + " elif P <= 0.6 * Srated:\n", " Q = 0.44 * Srated\n", - " elif P <= .8 * Srated:\n", + " elif P <= 0.8 * Srated:\n", " S_pf = P / 0.8\n", " Q = np.sqrt(S_pf**2 - P**2)\n", " else:\n", @@ -65,16 +67,20 @@ "outputs": [], "source": [ "S_rated = 100\n", - "V_vec = np.arange(240, 253.01, .01)\n", + "V_vec = np.arange(240, 253.01, 0.01)\n", "V_vec = np.round(V_vec, 2)\n", - "df = pd.DataFrame(V_vec, columns=['V'])\n", - "df['P_only'] = df['V'].apply(lambda x: get_max_P(x, Srated=S_rated))\n", - "df['Q_only'] = df.apply(lambda row: get_voltvar_Q(row['V'], 20, Srated=S_rated), axis=1)\n", - "df['max_P_with_Q_only'] = (S_rated**2 - df['Q_only']**2)**0.5\n", - "df['max_P_with_Q_nd_P'] = df.apply(lambda row: np.minimum(row['P_only'], row['max_P_with_Q_only']), axis=1)\n", - "df['absorb_Q'] = df.apply(lambda row: Q_capability(row['max_P_with_Q_nd_P'], Srated=S_rated), axis=1)\n", - "df['supply_Q'] = - df['absorb_Q']\n", - "df['S_absorbing'] = (df['absorb_Q']**2 + df['max_P_with_Q_nd_P']**2)**0.5\n" + "df = pd.DataFrame(V_vec, columns=[\"V\"])\n", + "df[\"P_only\"] = df[\"V\"].apply(lambda x: get_max_P(x, Srated=S_rated))\n", + "df[\"Q_only\"] = df.apply(lambda row: get_voltvar_Q(row[\"V\"], 20, Srated=S_rated), axis=1)\n", + "df[\"max_P_with_Q_only\"] = (S_rated**2 - df[\"Q_only\"] ** 2) ** 0.5\n", + "df[\"max_P_with_Q_nd_P\"] = df.apply(\n", + " lambda row: np.minimum(row[\"P_only\"], row[\"max_P_with_Q_only\"]), axis=1\n", + ")\n", + "df[\"absorb_Q\"] = df.apply(\n", + " lambda row: Q_capability(row[\"max_P_with_Q_nd_P\"], Srated=S_rated), axis=1\n", + ")\n", + "df[\"supply_Q\"] = -df[\"absorb_Q\"]\n", + "df[\"S_absorbing\"] = (df[\"absorb_Q\"] ** 2 + df[\"max_P_with_Q_nd_P\"] ** 2) ** 0.5" ] }, { @@ -107,20 +113,25 @@ "source": [ "fig = plt.figure(figsize=(10, 3), dpi=200)\n", "ax = fig.add_subplot(111)\n", - "ax.plot(df['V'], df['Q_only'], label='Volt-Var Response ', color='orange')\n", - "ax.plot(df['V'], df['max_P_with_Q_only'], label='Max Power with Volt-VAR Response', color='green')\n", + "ax.plot(df[\"V\"], df[\"Q_only\"], label=\"Volt-Var Response \", color=\"orange\")\n", + "ax.plot(\n", + " df[\"V\"],\n", + " df[\"max_P_with_Q_only\"],\n", + " label=\"Max Power with Volt-VAR Response\",\n", + " color=\"green\",\n", + ")\n", "# ax.plot(df['V'], df['max_P_with_Q_nd_P'], label='Max Power with Volt-VAR and Volt-Watt Response', linestyle='dotted', linewidth=4, color='red')\n", "# ax2.plot(df['V'], df['absorb_Q'], label='Absorbing Q limit', color='yellow')\n", "# ax2.plot(df['V'], df['supply_Q'], label='Supplying Q limit', color='purple')\n", "\n", - "ax.set_xlabel('Voltage (V)')\n", + "ax.set_xlabel(\"Voltage (V)\")\n", "# ax.set_ylabel('Max Power (%)')\n", - "ax.set_ylabel('Required Reactive Power (%)')\n", + "ax.set_ylabel(\"Required Reactive Power (%)\")\n", "# ax.set_xticks([min(V_vec), max(V_vec), 253, 254, 254.75, 255, 255.6, 256, 258, 260, 265])\n", "ax.set_xticks([min(V_vec), max(V_vec), 220, 240, 260, 265])\n", - "ax.set_xticklabels([f'{x}' for x in ax.get_xticks()], rotation=0)\n", - "ax.grid(True, which='both', linestyle='--', linewidth=0.5)\n", - "ax.legend(loc='upper right')\n", + "ax.set_xticklabels([f\"{x}\" for x in ax.get_xticks()], rotation=0)\n", + "ax.grid(True, which=\"both\", linestyle=\"--\", linewidth=0.5)\n", + "ax.legend(loc=\"upper right\")\n", "# ax.set_ylim(0, 105)\n", "# ax2.legend(loc='center left')" ] @@ -133,16 +144,20 @@ "outputs": [], "source": [ "S_rated = 100\n", - "V_vec = np.arange(240, 262.01, .01)\n", + "V_vec = np.arange(240, 262.01, 0.01)\n", "V_vec = np.round(V_vec, 2)\n", - "df = pd.DataFrame(V_vec, columns=['V'])\n", - "df['P_only'] = df['V'].apply(lambda x: get_max_P(x, Srated=S_rated))\n", - "df['Q_only'] = df.apply(lambda row: get_voltvar_Q(row['V'], 20, Srated=S_rated), axis=1)\n", - "df['max_P_with_Q_only'] = (S_rated**2 - df['Q_only']**2)**0.5\n", - "df['max_P_with_Q_nd_P'] = df.apply(lambda row: np.minimum(row['P_only'], row['max_P_with_Q_only']), axis=1)\n", - "df['absorb_Q'] = df.apply(lambda row: Q_capability(row['max_P_with_Q_nd_P'], Srated=S_rated), axis=1)\n", - "df['supply_Q'] = - df['absorb_Q']\n", - "df['S_absorbing'] = (df['absorb_Q']**2 + df['max_P_with_Q_nd_P']**2)**0.5\n" + "df = pd.DataFrame(V_vec, columns=[\"V\"])\n", + "df[\"P_only\"] = df[\"V\"].apply(lambda x: get_max_P(x, Srated=S_rated))\n", + "df[\"Q_only\"] = df.apply(lambda row: get_voltvar_Q(row[\"V\"], 20, Srated=S_rated), axis=1)\n", + "df[\"max_P_with_Q_only\"] = (S_rated**2 - df[\"Q_only\"] ** 2) ** 0.5\n", + "df[\"max_P_with_Q_nd_P\"] = df.apply(\n", + " lambda row: np.minimum(row[\"P_only\"], row[\"max_P_with_Q_only\"]), axis=1\n", + ")\n", + "df[\"absorb_Q\"] = df.apply(\n", + " lambda row: Q_capability(row[\"max_P_with_Q_nd_P\"], Srated=S_rated), axis=1\n", + ")\n", + "df[\"supply_Q\"] = -df[\"absorb_Q\"]\n", + "df[\"S_absorbing\"] = (df[\"absorb_Q\"] ** 2 + df[\"max_P_with_Q_nd_P\"] ** 2) ** 0.5" ] }, { @@ -222,9 +237,8 @@ "metadata": {}, "outputs": [], "source": [ - "font = {'weight': 'normal',\n", - " 'size' : 9}\n", - "matplotlib.rc('font', **font)" + "font = {\"weight\": \"normal\", \"size\": 9}\n", + "matplotlib.rc(\"font\", **font)" ] }, { @@ -247,17 +261,17 @@ "source": [ "fig = plt.figure(figsize=(6, 2.5), dpi=300)\n", "ax = fig.add_subplot(111)\n", - "ax.plot(df['V'], df['P_only'], label='Volt-Watt Response ', color='blue')\n", - "ax.set_xlabel('Voltage (V)')\n", - "ax.set_ylabel('Maximum power, P/$S_{rated}$ (%)')\n", + "ax.plot(df[\"V\"], df[\"P_only\"], label=\"Volt-Watt Response \", color=\"blue\")\n", + "ax.set_xlabel(\"Voltage (V)\")\n", + "ax.set_ylabel(\"Maximum power, P/$S_{rated}$ (%)\")\n", "ax.set_xticks([min(V_vec), max(V_vec), 253, 260, 265])\n", - "ax.set_xticklabels([f'{x}' for x in ax.get_xticks()], rotation=0)\n", - "ax.grid(True, which='both', linestyle='--', linewidth=0.5)\n", + "ax.set_xticklabels([f\"{x}\" for x in ax.get_xticks()], rotation=0)\n", + "ax.grid(True, which=\"both\", linestyle=\"--\", linewidth=0.5)\n", "# ax.legend(loc='center left')\n", "ax.set_ylim(0, 105)\n", "ax.margins(x=0)\n", "plt.tight_layout()\n", - "plt.savefig('Figures/volt_watt_response.png', dpi=300, bbox_inches='tight')\n", + "plt.savefig(\"Figures/volt_watt_response.png\", dpi=300, bbox_inches=\"tight\")\n", "# plt.show()" ] }, @@ -291,20 +305,20 @@ "source": [ "fig = plt.figure(figsize=(10, 3), dpi=200)\n", "ax = fig.add_subplot(111)\n", - "ax.plot(df['V'], df['Q_only'], label='Volt-Var Response ', color='orange')\n", + "ax.plot(df[\"V\"], df[\"Q_only\"], label=\"Volt-Var Response \", color=\"orange\")\n", "# ax.plot(df['V'], df['max_P_with_Q_only'], label='Max Power with Volt-VAR Response', color='green')\n", "# ax.plot(df['V'], df['max_P_with_Q_nd_P'], label='Max Power with Volt-VAR and Volt-Watt Response', linestyle='dotted', linewidth=4, color='red')\n", "# ax2.plot(df['V'], df['absorb_Q'], label='Absorbing Q limit', color='yellow')\n", "# ax2.plot(df['V'], df['supply_Q'], label='Supplying Q limit', color='purple')\n", "\n", - "ax.set_xlabel('Voltage (V)')\n", + "ax.set_xlabel(\"Voltage (V)\")\n", "# ax.set_ylabel('Max Power (%)')\n", - "ax.set_ylabel('Required Reactive Power (%)')\n", + "ax.set_ylabel(\"Required Reactive Power (%)\")\n", "# ax.set_xticks([min(V_vec), max(V_vec), 253, 254, 254.75, 255, 255.6, 256, 258, 260, 265])\n", "ax.set_xticks([min(V_vec), max(V_vec), 220, 240, 260, 265])\n", - "ax.set_xticklabels([f'{x}' for x in ax.get_xticks()], rotation=0)\n", - "ax.grid(True, which='both', linestyle='--', linewidth=0.5)\n", - "ax.legend(loc='upper right')\n", + "ax.set_xticklabels([f\"{x}\" for x in ax.get_xticks()], rotation=0)\n", + "ax.grid(True, which=\"both\", linestyle=\"--\", linewidth=0.5)\n", + "ax.legend(loc=\"upper right\")\n", "# ax.set_ylim(0, 105)\n", "# ax2.legend(loc='center left')" ] @@ -340,21 +354,33 @@ "fig = plt.figure(figsize=(14, 7), dpi=200)\n", "ax = fig.add_subplot(111)\n", "ax2 = ax.twinx()\n", - "ax.plot(df['V'], df['P_only'], label='Volt-Watt Response Only', color='blue')\n", - "ax2.plot(df['V'], df['Q_only'], label='Volt-VAR Response Only', color='orange')\n", - "ax.plot(df['V'], df['max_P_with_Q_only'], label='Max Power with Volt-VAR Response', color='green')\n", - "ax.plot(df['V'], df['max_P_with_Q_nd_P'], label='Max Power with Volt-VAR and Volt-Watt Response', linestyle='dotted', linewidth=4, color='red')\n", + "ax.plot(df[\"V\"], df[\"P_only\"], label=\"Volt-Watt Response Only\", color=\"blue\")\n", + "ax2.plot(df[\"V\"], df[\"Q_only\"], label=\"Volt-VAR Response Only\", color=\"orange\")\n", + "ax.plot(\n", + " df[\"V\"],\n", + " df[\"max_P_with_Q_only\"],\n", + " label=\"Max Power with Volt-VAR Response\",\n", + " color=\"green\",\n", + ")\n", + "ax.plot(\n", + " df[\"V\"],\n", + " df[\"max_P_with_Q_nd_P\"],\n", + " label=\"Max Power with Volt-VAR and Volt-Watt Response\",\n", + " linestyle=\"dotted\",\n", + " linewidth=4,\n", + " color=\"red\",\n", + ")\n", "# ax2.plot(df['V'], df['absorb_Q'], label='Absorbing Q limit', color='yellow')\n", "# ax2.plot(df['V'], df['supply_Q'], label='Supplying Q limit', color='purple')\n", "\n", - "ax.set_xlabel('Voltage (V)')\n", - "ax.set_ylabel('Max Power (kW)')\n", - "ax2.set_ylabel('Required Reactive Power (kVAR)')\n", + "ax.set_xlabel(\"Voltage (V)\")\n", + "ax.set_ylabel(\"Max Power (kW)\")\n", + "ax2.set_ylabel(\"Required Reactive Power (kVAR)\")\n", "ax.set_xticks([min(V_vec), max(V_vec), 220, 240, 253, 255.6, 258, 260])\n", - "ax.set_xticklabels([f'{x}' for x in ax.get_xticks()], rotation=45)\n", - "ax.grid(True, which='both', linestyle='--', linewidth=0.5)\n", - "ax.legend(loc='lower left')\n", - "ax2.legend(loc='center left')" + "ax.set_xticklabels([f\"{x}\" for x in ax.get_xticks()], rotation=45)\n", + "ax.grid(True, which=\"both\", linestyle=\"--\", linewidth=0.5)\n", + "ax.legend(loc=\"lower left\")\n", + "ax2.legend(loc=\"center left\")" ] }, { @@ -386,26 +412,40 @@ ], "source": [ "S_rated = 100\n", - "V_vec = np.arange(240, 265, .001)\n", - "df = pd.DataFrame(V_vec, columns=['V'])\n", - "df['P_only'] = df['V'].apply(lambda x: get_max_P(x, Srated=S_rated))\n", - "df['Q_only'] = df.apply(lambda row: get_voltvar_Q(row['V'], 20, Srated=S_rated), axis=1)\n", - "df['max_P_with_Q_only'] = (S_rated**2 - df['Q_only']**2)**0.5\n", - "df['max_P_with_Q_nd_P'] = df.apply(lambda row: np.minimum(row['P_only'], row['max_P_with_Q_only']), axis=1)\n", + "V_vec = np.arange(240, 265, 0.001)\n", + "df = pd.DataFrame(V_vec, columns=[\"V\"])\n", + "df[\"P_only\"] = df[\"V\"].apply(lambda x: get_max_P(x, Srated=S_rated))\n", + "df[\"Q_only\"] = df.apply(lambda row: get_voltvar_Q(row[\"V\"], 20, Srated=S_rated), axis=1)\n", + "df[\"max_P_with_Q_only\"] = (S_rated**2 - df[\"Q_only\"] ** 2) ** 0.5\n", + "df[\"max_P_with_Q_nd_P\"] = df.apply(\n", + " lambda row: np.minimum(row[\"P_only\"], row[\"max_P_with_Q_only\"]), axis=1\n", + ")\n", "fig = plt.figure(figsize=(10, 3), dpi=200)\n", "ax = fig.add_subplot(111)\n", "# ax2 = ax.twinx()\n", - "ax.plot(df['V'], df['P_only'], label='Volt-Watt Response Only', color='blue')\n", - "ax2.plot(df['V'], df['Q_only'], label='Volt-VAR Response Only', color='orange')\n", - "ax.plot(df['V'], df['max_P_with_Q_only'], label='Max Power with Volt-VAR Response', color='green')\n", - "ax.plot(df['V'], df['max_P_with_Q_nd_P'], label='Max Power with Volt-VAR and Volt-Watt Response', linestyle='dotted', linewidth=4, color='red')\n", - "ax.set_xlabel('Voltage (V)')\n", - "ax.set_ylabel('Max Power (kW)')\n", + "ax.plot(df[\"V\"], df[\"P_only\"], label=\"Volt-Watt Response Only\", color=\"blue\")\n", + "ax2.plot(df[\"V\"], df[\"Q_only\"], label=\"Volt-VAR Response Only\", color=\"orange\")\n", + "ax.plot(\n", + " df[\"V\"],\n", + " df[\"max_P_with_Q_only\"],\n", + " label=\"Max Power with Volt-VAR Response\",\n", + " color=\"green\",\n", + ")\n", + "ax.plot(\n", + " df[\"V\"],\n", + " df[\"max_P_with_Q_nd_P\"],\n", + " label=\"Max Power with Volt-VAR and Volt-Watt Response\",\n", + " linestyle=\"dotted\",\n", + " linewidth=4,\n", + " color=\"red\",\n", + ")\n", + "ax.set_xlabel(\"Voltage (V)\")\n", + "ax.set_ylabel(\"Max Power (kW)\")\n", "# ax2.set_ylabel('Required Reactive Power (kVAR)')\n", - "ax.set_xticks([min(V_vec), max(V_vec),253, 254, 255, 256, 260])\n", - "ax.grid(True, which='both', linestyle='--', linewidth=0.5)\n", - "ax.legend(loc='lower left')\n", - "ax2.legend(loc='center left')" + "ax.set_xticks([min(V_vec), max(V_vec), 253, 254, 255, 256, 260])\n", + "ax.grid(True, which=\"both\", linestyle=\"--\", linewidth=0.5)\n", + "ax.legend(loc=\"lower left\")\n", + "ax2.legend(loc=\"center left\")" ] } ], diff --git a/SolA2024_Analysis/README.md b/SolA2024_Analysis/README.md index 53c8081..9585553 100644 --- a/SolA2024_Analysis/README.md +++ b/SolA2024_Analysis/README.md @@ -79,4 +79,4 @@ Please note that not all data is present for every circuit, depending on the dev ## High-resolution Emergency Backstop Measures (EBM) data -EBM data is located in `ebm_data/` and comprises...TBD \ No newline at end of file +EBM data is located in `ebm_data/` and comprises...TBD diff --git a/SolA2024_Analysis/V-Var-Conf-impact.ipynb b/SolA2024_Analysis/V-Var-Conf-impact.ipynb index ec86a67..e091a15 100644 --- a/SolA2024_Analysis/V-Var-Conf-impact.ipynb +++ b/SolA2024_Analysis/V-Var-Conf-impact.ipynb @@ -8,12 +8,14 @@ "outputs": [], "source": [ "# investigating the impact of voltvar conformance on local voltage. Would it still go above 253?!\n", - "from Data_query.trino_config import *\n", "import json\n", - "import numpy as np\n", + "\n", "import matplotlib.pyplot as plt\n", - "from visualisation import *\n", - "import pytz\n" + "import numpy as np\n", + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -31,7 +33,7 @@ } ], "source": [ - "stop_trino()\n" + "stop_trino()" ] }, { @@ -52,7 +54,7 @@ ], "source": [ "num_workers = 0\n", - "ensure_trino_running(worker_desired_count = 1, big_worker_desired_count=num_workers)\n", + "ensure_trino_running(worker_desired_count=1, big_worker_desired_count=num_workers)\n", "sleep(30)" ] }, @@ -166,17 +168,19 @@ } ], "source": [ - "v1=207\n", - "v2=220\n", - "v3=240\n", - "v4=258\n", - "Q1=.44\n", - "Q4=.60\n", + "v1 = 207\n", + "v2 = 220\n", + "v3 = 240\n", + "v4 = 258\n", + "Q1 = 0.44\n", + "Q4 = 0.60\n", "thr1 = -0.1\n", "thr2 = 0.1\n", - "thr3 = .9\n", + "thr3 = 0.9\n", "thr4 = 1.1\n", "df_list = []\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons, v1, v2, v3, v4, Q1, Q4, thr1, thr2, thr3, thr4 = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -265,16 +269,27 @@ " where V > 253 and nonconformance_voltvar = 0 and Q_voltvar_max_final < 0\n", " group by year, month, site_id\n", " \"\"\")\n", - " \n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + "\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons, v1, v2, v3, v4, Q1, Q4, thr1, thr2, thr3, thr4) for year in (2024, 2025) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in ['system.bucket(postcode, 16) <=7 ', 'system.bucket(postcode, 16) > 7'] ]\n", - " \n", - "df_sites = trino_parallel(run_func, tasks, num_workers=num_workers)\n" + "\n", + "tasks = [\n", + " (year, month, split_cons, v1, v2, v3, v4, Q1, Q4, thr1, thr2, thr3, thr4)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) <=7 ', 'system.bucket(postcode, 16) > 7'] ]\n", + "\n", + "df_sites = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, { @@ -423,19 +438,30 @@ " group by site_id\n", " \n", " \"\"\")\n", - " # \n", - " \n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " #\n", + "\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, 2025) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in ['system.bucket(postcode, 16) <=7 ', 'system.bucket(postcode, 16) > 7'] ]\n", "\n", - " \n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) <=7 ', 'system.bucket(postcode, 16) > 7'] ]\n", + "\n", + "\n", "df_total_solar = trino_parallel(run_func, tasks, num_workers=num_workers)\n", - "df_total_solar['total_time_pv'].sum()" + "df_total_solar[\"total_time_pv\"].sum()" ] }, { @@ -548,17 +574,19 @@ } ], "source": [ - "v1=207\n", - "v2=220\n", - "v3=240\n", - "v4=258\n", - "Q1=.44\n", - "Q4=.60\n", + "v1 = 207\n", + "v2 = 220\n", + "v3 = 240\n", + "v4 = 258\n", + "Q1 = 0.44\n", + "Q4 = 0.60\n", "thr1 = -0.1\n", "thr2 = 0.1\n", - "thr3 = .9\n", + "thr3 = 0.9\n", "thr4 = 1.1\n", "df_list = []\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons, v1, v2, v3, v4, Q1, Q4, thr1, thr2, thr3, thr4 = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -646,16 +674,27 @@ " from pq6\n", " group by year, month, site_id\n", " \"\"\")\n", - " \n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + "\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons, v1, v2, v3, v4, Q1, Q4, thr1, thr2, thr3, thr4) for year in (2024, 2025) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in ['system.bucket(postcode, 16) <=7 ', 'system.bucket(postcode, 16) > 7'] ]\n", - " \n", - "df_total = trino_parallel(run_func, tasks, num_workers=num_workers)\n" + "\n", + "tasks = [\n", + " (year, month, split_cons, v1, v2, v3, v4, Q1, Q4, thr1, thr2, thr3, thr4)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) <=7 ', 'system.bucket(postcode, 16) > 7'] ]\n", + "\n", + "df_total = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, { @@ -878,15 +917,17 @@ "metadata": {}, "outputs": [], "source": [ - "df1 = df_sites.groupby('site_id').agg({'n_time': 'sum'}).reset_index()\n", - "df2 = df_total.groupby('site_id').agg({'total_time': 'sum'}).reset_index()\n", - "df0 = df_total_solar.groupby('site_id').agg({'total_time_pv': 'sum'}).reset_index()\n", - "df3 = df1.merge(df2, on='site_id')\n", - "df3 = df3.merge(df0, on='site_id')\n", - "df3['n_time_percentage'] = df3['n_time'] / df3['total_time'] * 100\n", - "df3['n_time_percentage_solar'] = df3['n_time'] / df3['total_time_pv'] * 100\n", - "df3 = df3.sort_values(by='n_time_percentage_solar', ascending=False).reset_index(drop=True)\n", - "df3['s_id'] = df3.index" + "df1 = df_sites.groupby(\"site_id\").agg({\"n_time\": \"sum\"}).reset_index()\n", + "df2 = df_total.groupby(\"site_id\").agg({\"total_time\": \"sum\"}).reset_index()\n", + "df0 = df_total_solar.groupby(\"site_id\").agg({\"total_time_pv\": \"sum\"}).reset_index()\n", + "df3 = df1.merge(df2, on=\"site_id\")\n", + "df3 = df3.merge(df0, on=\"site_id\")\n", + "df3[\"n_time_percentage\"] = df3[\"n_time\"] / df3[\"total_time\"] * 100\n", + "df3[\"n_time_percentage_solar\"] = df3[\"n_time\"] / df3[\"total_time_pv\"] * 100\n", + "df3 = df3.sort_values(by=\"n_time_percentage_solar\", ascending=False).reset_index(\n", + " drop=True\n", + ")\n", + "df3[\"s_id\"] = df3.index" ] }, { @@ -909,20 +950,20 @@ "source": [ "fig, ax = plt.subplots(figsize=(10, 3), dpi=300)\n", "df3.plot(\n", - " kind='bar',\n", + " kind=\"bar\",\n", " ax=ax,\n", - " x='s_id',\n", - " y='n_time_percentage',\n", - " title='Percentage of number of timestamps with V > 253 and correct VoltVar behaviour per site',\n", - " ylabel='Percentage of time (%)',\n", + " x=\"s_id\",\n", + " y=\"n_time_percentage\",\n", + " title=\"Percentage of number of timestamps with V > 253 and correct VoltVar behaviour per site\",\n", + " ylabel=\"Percentage of time (%)\",\n", ")\n", "\n", "# Set custom tick positions\n", - "xticks_positions = np.linspace(0, df3.shape[0]-1, 30, dtype=int)\n", + "xticks_positions = np.linspace(0, df3.shape[0] - 1, 30, dtype=int)\n", "ax.set_xticks(xticks_positions)\n", "\n", "# Optionally label them (if you want specific site IDs)\n", - "ax.set_xticklabels(df3['s_id'].iloc[xticks_positions], rotation=45, ha='right')\n", + "ax.set_xticklabels(df3[\"s_id\"].iloc[xticks_positions], rotation=45, ha=\"right\")\n", "plt.grid()\n", "plt.tight_layout()\n", "plt.show()" @@ -948,20 +989,20 @@ "source": [ "fig, ax = plt.subplots(figsize=(10, 3), dpi=300)\n", "df3.plot(\n", - " kind='bar',\n", + " kind=\"bar\",\n", " ax=ax,\n", - " x='s_id',\n", - " y='n_time_percentage_solar',\n", - " title='Percentage of number of timestamps with V > 253 and correct VoltVar behaviour per site',\n", - " ylabel='Percentage of solar time (%)',\n", + " x=\"s_id\",\n", + " y=\"n_time_percentage_solar\",\n", + " title=\"Percentage of number of timestamps with V > 253 and correct VoltVar behaviour per site\",\n", + " ylabel=\"Percentage of solar time (%)\",\n", ")\n", "\n", "# Set custom tick positions\n", - "xticks_positions = np.linspace(0, df3.shape[0]-1, 30, dtype=int)\n", + "xticks_positions = np.linspace(0, df3.shape[0] - 1, 30, dtype=int)\n", "ax.set_xticks(xticks_positions)\n", "\n", "# Optionally label them (if you want specific site IDs)\n", - "ax.set_xticklabels(df3['s_id'].iloc[xticks_positions], rotation=45, ha='right')\n", + "ax.set_xticklabels(df3[\"s_id\"].iloc[xticks_positions], rotation=45, ha=\"right\")\n", "plt.grid()\n", "plt.tight_layout()\n", "plt.show()" @@ -1029,7 +1070,7 @@ " group by circuit_id, t_stamp\n", " having count(t_stamp) > 1\n", " limit 10\n", - " \"\"\")\n" + " \"\"\")" ] }, { @@ -1146,7 +1187,7 @@ " from ts \n", " where year=2024 and month=10 and is_pv=True \n", " and circuit_id = 373631 and t_stamp = TIMESTAMP '2024-10-16 01:20:00'\n", - " \"\"\")\n" + " \"\"\")" ] }, { @@ -1237,7 +1278,7 @@ " from ts \n", " where year=2024 and month=10 and is_pv=True \n", " and circuit_id = 373631 and t_stamp = TIMESTAMP '2024-10-16 01:20:00'\n", - " \"\"\")\n" + " \"\"\")" ] }, { @@ -1290,7 +1331,9 @@ } ], "source": [ - "hive_sql(f\"\"\"select count( t_stamp) from ts where year=2024 and month=12 and is_pv=True limit 10\"\"\")\n" + "hive_sql(\n", + " f\"\"\"select count( t_stamp) from ts where year=2024 and month=12 and is_pv=True limit 10\"\"\"\n", + ")" ] }, { diff --git a/SolA2024_Analysis/Volt-Var-Trino.ipynb b/SolA2024_Analysis/Volt-Var-Trino.ipynb index f3d2141..61916f1 100644 --- a/SolA2024_Analysis/Volt-Var-Trino.ipynb +++ b/SolA2024_Analysis/Volt-Var-Trino.ipynb @@ -7,13 +7,16 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import json\n", - "import numpy as np\n", + "\n", "import boto3\n", + "import numpy as np\n", "from sklearn.neighbors import KDTree\n", + "\n", + "from Data_query.trino_config import *\n", + "\n", "session = boto3.Session()\n", - "s3 = boto3.client('s3')\n", + "s3 = boto3.client(\"s3\")\n", "import random" ] }, @@ -53,8 +56,8 @@ "big_workers = 2\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", - "sleep(90)\n" + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", + "sleep(90)" ] }, { @@ -65,18 +68,18 @@ "outputs": [], "source": [ "def Q_impact(row):\n", - " r1 = abs(row['Q_kvar']) / (abs(row['Q_voltvar_max_final']) + 1e-9)\n", - " r2 = abs(row['Q_kvar']) / (abs(row['Q_voltvar_min_final']) + 1e-9)\n", + " r1 = abs(row[\"Q_kvar\"]) / (abs(row[\"Q_voltvar_max_final\"]) + 1e-9)\n", + " r2 = abs(row[\"Q_kvar\"]) / (abs(row[\"Q_voltvar_min_final\"]) + 1e-9)\n", "\n", " # choose the smaller one, but keep track of which was chosen\n", " if r1 <= r2:\n", " chosen = r1\n", - " sign = np.sign(row['Q_voltvar_max_final']) * np.sign(row['Q_kvar'])\n", + " sign = np.sign(row[\"Q_voltvar_max_final\"]) * np.sign(row[\"Q_kvar\"])\n", " else:\n", " chosen = r2\n", - " sign = np.sign(row['Q_voltvar_min_final']) * np.sign(row['Q_kvar'])\n", + " sign = np.sign(row[\"Q_voltvar_min_final\"]) * np.sign(row[\"Q_kvar\"])\n", "\n", - " if row['Q_voltvar_max_final'] + row['Q_voltvar_min_final'] == 0:\n", + " if row[\"Q_voltvar_max_final\"] + row[\"Q_voltvar_min_final\"] == 0:\n", " sign = 1\n", " return chosen * sign" ] @@ -88,37 +91,38 @@ "metadata": {}, "outputs": [], "source": [ - "def get_voltvar_Q(V, Srated=1, v1=207, v2=220, v3=240, v4=258, Q1=.44, Q4=.60):\n", + "def get_voltvar_Q(V, Srated=1, v1=207, v2=220, v3=240, v4=258, Q1=0.44, Q4=0.60):\n", " if V <= v1:\n", - " Q = Q1* Srated\n", + " Q = Q1 * Srated\n", " elif v1 <= V < v2:\n", - " m = (Q1* Srated - 0) / (v1 - v2)\n", + " m = (Q1 * Srated - 0) / (v1 - v2)\n", " Q = m * (V - v2)\n", " elif v2 <= V <= v3:\n", " Q = float(0)\n", " elif v3 < V < v4:\n", - " m = (0 - Q4* Srated) / (v3 - v4)\n", - " Q = -m * (V - v4) - Q4* Srated\n", + " m = (0 - Q4 * Srated) / (v3 - v4)\n", + " Q = -m * (V - v4) - Q4 * Srated\n", " else: # V >= v4\n", - " Q = - Q4* Srated\n", + " Q = -Q4 * Srated\n", "\n", " return Q\n", "\n", + "\n", "def Q_capability_absorbing(P, Srated=1):\n", - " if abs(P) < .2 * Srated:\n", + " if abs(P) < 0.2 * Srated:\n", " Q = float(0)\n", - " elif abs(P) <= .6 * Srated:\n", - " Q = - 0.44 * Srated\n", - " elif abs(P) <= .8 * Srated:\n", + " elif abs(P) <= 0.6 * Srated:\n", + " Q = -0.44 * Srated\n", + " elif abs(P) <= 0.8 * Srated:\n", " S_pf = abs(P) / 0.8\n", - " val = S_pf**2 - abs(P)**2\n", - " if val < 0: # protect against negatives\n", + " val = S_pf**2 - abs(P) ** 2\n", + " if val < 0: # protect against negatives\n", " Q = float(0)\n", " else:\n", " Q = -math.sqrt(val)\n", " else:\n", - " val = Srated**2 - abs(P)**2\n", - " if val < 0: # protect against negatives\n", + " val = Srated**2 - abs(P) ** 2\n", + " if val < 0: # protect against negatives\n", " Q = float(0)\n", " else:\n", " Q = -math.sqrt(val)\n", @@ -489,17 +493,19 @@ } ], "source": [ - "v1=207\n", - "v2=220\n", - "v3=240\n", - "v4=258\n", + "v1 = 207\n", + "v2 = 220\n", + "v3 = 240\n", + "v4 = 258\n", "voltwatt_V = 253\n", - "Q1=.44\n", - "Q4=.60\n", + "Q1 = 0.44\n", + "Q4 = 0.60\n", "thr1 = -0.1\n", "thr2 = 0.1\n", - "thr3 = .9\n", + "thr3 = 0.9\n", "thr4 = 1.1\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " iceberg_exec(f\"\"\"\n", @@ -608,12 +614,23 @@ " group by year, month, day, day_night, site_id\n", " \"\"\")\n", " sleep(random.randint(5, 15))\n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return None\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, 2025) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", "trino_parallel_batch(run_func, tasks, num_workers=num_workers, batch_size=num_workers)" ] }, @@ -883,7 +900,7 @@ "iceberg_sql(f\"\"\"select year, month, count( site_id)/1000 from conformance_voltvar\n", " group by year, month\n", " order by year, month\n", - " \"\"\").round()\n" + " \"\"\").round()" ] }, { @@ -1264,7 +1281,7 @@ } ], "source": [ - "iceberg_sql('select * from conformance_voltvar limit 10')" + "iceberg_sql(\"select * from conformance_voltvar limit 10\")" ] }, { @@ -1485,7 +1502,7 @@ "iceberg_sql(f\"\"\"select year, month, count( site_id)/1000 from conformance_voltvar\n", " group by year, month\n", " order by year, month\n", - " \"\"\").round()\n" + " \"\"\").round()" ] }, { @@ -1541,7 +1558,7 @@ " group by circuit_id, t_stamp\n", " having count(t_stamp) > 1\n", " limit 10\n", - " \"\"\")\n" + " \"\"\")" ] }, { @@ -1637,7 +1654,7 @@ " from ts \n", " where year=2024 and month=10 and is_pv=True \n", " and circuit_id = 373631 and t_stamp = TIMESTAMP '2024-10-16 01:20:00'\n", - " \"\"\")\n" + " \"\"\")" ] }, { @@ -1728,7 +1745,7 @@ " from ts \n", " where year=2024 and month=10 and is_pv=True \n", " and circuit_id = 373631 and t_stamp = TIMESTAMP '2024-10-16 01:20:00'\n", - " \"\"\")\n" + " \"\"\")" ] }, { @@ -1781,7 +1798,9 @@ } ], "source": [ - "hive_sql(f\"\"\"select count( t_stamp) from ts where year=2024 and month=12 and is_pv=True limit 10\"\"\")\n" + "hive_sql(\n", + " f\"\"\"select count( t_stamp) from ts where year=2024 and month=12 and is_pv=True limit 10\"\"\"\n", + ")" ] }, { diff --git a/SolA2024_Analysis/Volt-Var.ipynb b/SolA2024_Analysis/Volt-Var.ipynb index 9586f32..07e963e 100644 --- a/SolA2024_Analysis/Volt-Var.ipynb +++ b/SolA2024_Analysis/Volt-Var.ipynb @@ -28,9 +28,11 @@ } ], "source": [ + "import numpy as np\n", + "\n", "from Data_query.spark_config import *\n", "from visualisation import *\n", - "import numpy as np\n", + "\n", "spark.catalog.setCurrentDatabase(\"solar_analytics\")\n", "warehouse_dir = spark.conf.get(\"spark.sql.warehouse.dir\")\n", "print(warehouse_dir)" @@ -123,41 +125,46 @@ "outputs": [], "source": [ "import math\n", - "def get_voltvar_Q(V, Srated=1, v1=207, v2=220, v3=240, v4=258, Q1=.44, Q4=.60):\n", + "\n", + "\n", + "def get_voltvar_Q(V, Srated=1, v1=207, v2=220, v3=240, v4=258, Q1=0.44, Q4=0.60):\n", " if V <= v1:\n", - " Q = Q1* Srated\n", + " Q = Q1 * Srated\n", " elif v1 <= V < v2:\n", - " m = (Q1* Srated - 0) / (v1 - v2)\n", + " m = (Q1 * Srated - 0) / (v1 - v2)\n", " Q = m * (V - v2)\n", " elif v2 <= V <= v3:\n", " Q = float(0)\n", " elif v3 < V < v4:\n", - " m = (0 - Q4* Srated) / (v3 - v4)\n", - " Q = -m * (V - v4) - Q4* Srated\n", + " m = (0 - Q4 * Srated) / (v3 - v4)\n", + " Q = -m * (V - v4) - Q4 * Srated\n", " else: # V >= v4\n", - " Q = - Q4* Srated\n", + " Q = -Q4 * Srated\n", "\n", " return Q\n", "\n", + "\n", "def Q_capability_absorbing(P, Srated=1):\n", - " if abs(P) < .2 * Srated:\n", + " if abs(P) < 0.2 * Srated:\n", " Q = float(0)\n", - " elif abs(P) <= .6 * Srated:\n", - " Q = - 0.44 * Srated\n", - " elif abs(P) <= .8 * Srated:\n", + " elif abs(P) <= 0.6 * Srated:\n", + " Q = -0.44 * Srated\n", + " elif abs(P) <= 0.8 * Srated:\n", " S_pf = abs(P) / 0.8\n", - " val = S_pf**2 - abs(P)**2\n", - " if val < 0: # protect against negatives\n", + " val = S_pf**2 - abs(P) ** 2\n", + " if val < 0: # protect against negatives\n", " Q = float(0)\n", " else:\n", " Q = -math.sqrt(val)\n", " else:\n", - " val = Srated**2 - abs(P)**2\n", - " if val < 0: # protect against negatives\n", + " val = Srated**2 - abs(P) ** 2\n", + " if val < 0: # protect against negatives\n", " Q = float(0)\n", " else:\n", " Q = -math.sqrt(val)\n", " return Q\n", + "\n", + "\n", "get_voltvar_Q_udf = udf(get_voltvar_Q, DoubleType())\n", "Q_capability_absorbing_udf = udf(Q_capability_absorbing, DoubleType())" ] @@ -172,9 +179,10 @@ "def Q_impact(Q_kvar, Q_voltvar_max, Q_voltvar_min):\n", " if Q_kvar is None or Q_voltvar_max is None or Q_voltvar_min is None:\n", " return None\n", + "\n", " def sign_val(x):\n", " return (x > 0) - (x < 0)\n", - " \n", + "\n", " eps = float(1e-9)\n", " r1 = abs(Q_kvar) / (abs(Q_voltvar_max) + eps)\n", " r2 = abs(Q_kvar) / (abs(Q_voltvar_min) + eps)\n", @@ -188,7 +196,9 @@ " if Q_voltvar_max + Q_voltvar_min == 0:\n", " sign = 1\n", " return chosen * sign\n", - "Q_impact_udf = udf(Q_impact, DoubleType())\n" + "\n", + "\n", + "Q_impact_udf = udf(Q_impact, DoubleType())" ] }, { @@ -209,7 +219,7 @@ } ], "source": [ - "get_voltvar_Q(247.6,110)" + "get_voltvar_Q(247.6, 110)" ] }, { @@ -232,8 +242,8 @@ "# ts = ts.withColumn('Q_kvar', col('energy_reactive') / 1000 * 12).drop('energy_reactive')\n", "\n", "# ts = ts.withColumn(\"day_night\", expr(\"\"\"\n", - "# CASE \n", - "# WHEN hour(from_utc_timestamp(t_stamp, 'UTC+10')) BETWEEN 9 AND 16 \n", + "# CASE\n", + "# WHEN hour(from_utc_timestamp(t_stamp, 'UTC+10')) BETWEEN 9 AND 16\n", "# THEN 'day'\n", "# ELSE 'night'\n", "# END\n", @@ -261,23 +271,23 @@ "# greatest(col(\"Q_voltvar_max\"), col(\"Q_capability_absorbing\")+ .04*df.ac_capacity_kw))\\\n", "# .otherwise(col(\"Q_voltvar_max\")))\n", "\n", - " \n", + "\n", "# df = df.withColumn(\"Q_voltvar_min\", when(col(\"Q_voltvar_min\") > lit(0), \\\n", "# least(col(\"Q_voltvar_min\"), col(\"Q_capability_supplying\")- .04*df.ac_capacity_kw))\\\n", "# .otherwise(col(\"Q_voltvar_min\")))\n", "\n", "# df = df.withColumn(\"extra_absorb\", when((col(\"Q_kvar\") < 0) & (col(\"Q_kvar\") < col(\"Q_voltvar_min\")), \\\n", "# col(\"Q_voltvar_min\") - col(\"Q_kvar\")).otherwise(lit(0)))\n", - " \n", + "\n", "# df = df.withColumn(\"deficit_absorb\", when((col(\"Q_kvar\") < 0) & (col(\"Q_kvar\") > col(\"Q_voltvar_max\")), \\\n", "# col(\"Q_kvar\") - col(\"Q_voltvar_max\")).otherwise(lit(0)))\n", - " \n", + "\n", "# df = df.withColumn(\"extra_supply\", when((col(\"Q_kvar\") > 0) & (col(\"Q_kvar\") > col(\"Q_voltvar_max\")), \\\n", "# col(\"Q_kvar\") - col(\"Q_voltvar_max\")).otherwise(lit(0)))\n", - " \n", + "\n", "# df = df.withColumn(\"deficit_supply\", when((col(\"Q_kvar\") > 0) & (col(\"Q_kvar\") < col(\"Q_voltvar_min\")), \\\n", "# col(\"Q_voltvar_min\") - col(\"Q_kvar\")).otherwise(lit(0)))\n", - " \n", + "\n", "# df = df.withColumn(\"noncompliance_voltvar\", col(\"extra_absorb\") + col(\"deficit_absorb\")+ col(\"extra_supply\")+ col(\"deficit_supply\"))\n", "\n", "# df = df.withColumn(\"day\", dayofmonth(col(\"t_stamp\")))\n", @@ -349,8 +359,8 @@ "# ts = ts.withColumn('Q_kvar', col('energy_reactive') / 1000 * 12).drop('energy_reactive')\n", "\n", "# ts = ts.withColumn(\"day_night\", expr(\"\"\"\n", - "# CASE \n", - "# WHEN hour(from_utc_timestamp(t_stamp, 'UTC+10')) BETWEEN 9 AND 16 \n", + "# CASE\n", + "# WHEN hour(from_utc_timestamp(t_stamp, 'UTC+10')) BETWEEN 9 AND 16\n", "# THEN 'day'\n", "# ELSE 'night'\n", "# END\n", @@ -379,7 +389,7 @@ "# greatest(col(\"Q_voltvar_max\"), col(\"Q_capability_absorbing\")+ .04*df.ac_capacity_kw))\\\n", "# .otherwise(col(\"Q_voltvar_max\")))\n", "\n", - " \n", + "\n", "# df = df.withColumn(\"Q_voltvar_min\", when(col(\"Q_voltvar_min\") > lit(0), \\\n", "# least(col(\"Q_voltvar_min\"), col(\"Q_capability_supplying\")- .04*df.ac_capacity_kw))\\\n", "# .otherwise(col(\"Q_voltvar_min\")))\n", @@ -400,7 +410,7 @@ "# df = df.withColumn(\"extra_absorb_ben_high\", when((col(\"Q_kvar\") < 0) & (col(\"Q_kvar\") < col(\"Q_voltvar_min\"))\\\n", "# & (col(\"Q_impact\") >= pos_threshold), \\\n", "# col(\"Q_voltvar_min\") - col(\"Q_kvar\")).otherwise(lit(0)))\n", - " \n", + "\n", "# df = df.withColumn(\"deficit_absorb_adv_high\", when((col(\"Q_kvar\") < 0) & (col(\"Q_kvar\") > col(\"Q_voltvar_max\"))\\\n", "# & (col(\"Q_impact\") < neg_threshold), \\\n", "# col(\"Q_kvar\") - col(\"Q_voltvar_max\")).otherwise(lit(0)))\n", @@ -424,7 +434,7 @@ "# df = df.withColumn(\"extra_supply_adv_low\", when((col(\"Q_kvar\") > 0) & (col(\"Q_kvar\") > col(\"Q_voltvar_max\"))\\\n", "# & (col(\"Q_impact\") >= neg_threshold) & (col(\"Q_impact\") < 0), \\\n", "# col(\"Q_kvar\") - col(\"Q_voltvar_max\")).otherwise(lit(0)))\n", - " \n", + "\n", "# df = df.withColumn(\"extra_supply_ben_low\", when((col(\"Q_kvar\") > 0) & (col(\"Q_kvar\") > col(\"Q_voltvar_max\"))\\\n", "# & (col(\"Q_impact\") >= 0) & (col(\"Q_impact\") < pos_threshold), \\\n", "# col(\"Q_kvar\") - col(\"Q_voltvar_max\")).otherwise(lit(0)))\n", @@ -432,7 +442,7 @@ "# df = df.withColumn(\"extra_supply_ben_high\", when((col(\"Q_kvar\") > 0) & (col(\"Q_kvar\") > col(\"Q_voltvar_max\"))\\\n", "# & (col(\"Q_impact\") >= pos_threshold), \\\n", "# col(\"Q_kvar\") - col(\"Q_voltvar_max\")).otherwise(lit(0)))\n", - " \n", + "\n", "# df = df.withColumn(\"deficit_supply_adv_high\", when((col(\"Q_kvar\") > 0) & (col(\"Q_kvar\") < col(\"Q_voltvar_min\"))\\\n", "# & (col(\"Q_impact\") < neg_threshold), \\\n", "# col(\"Q_voltvar_min\") - col(\"Q_kvar\")).otherwise(lit(0)))\n", @@ -448,7 +458,7 @@ "# df = df.withColumn(\"deficit_supply_ben_high\", when((col(\"Q_kvar\") > 0) & (col(\"Q_kvar\") < col(\"Q_voltvar_min\"))\\\n", "# & (col(\"Q_impact\") >= pos_threshold), \\\n", "# col(\"Q_voltvar_min\") - col(\"Q_kvar\")).otherwise(lit(0)))\n", - " \n", + "\n", "# df = df.withColumn(\"noncompliance_voltvar\", \\\n", "# col(\"extra_absorb_adv_high\") + col(\"extra_absorb_adv_low\") +col(\"extra_absorb_ben_high\") +col(\"extra_absorb_ben_low\") +\\\n", "# col(\"deficit_absorb_adv_high\")+ col(\"deficit_absorb_adv_low\")+ col(\"deficit_absorb_ben_high\")+ col(\"deficit_absorb_ben_low\")+ \\\n", @@ -518,103 +528,213 @@ "source": [ "first_write = True\n", "for year in (2024, 2025):\n", - "# for year in (2025, ):\n", + " # for year in (2025, ):\n", " # for month in (1, ):\n", " for month in range(1, 13):\n", - " ts = spark.read.table(\"ts\").filter(f\"is_pv = True and year = {year} and month = {month}\")\\\n", - " .select(\"circuit_id\", \"t_stamp\", \"power\", \"energy_reactive\", \"voltage\", \"year\", \"month\")\n", - " \n", - " ts = ts.withColumn('P_kW', col('power') / 1000).drop('power')\n", - " ts = ts.withColumn('Q_kvar', col('energy_reactive') / 1000 * 12).drop('energy_reactive')\n", + " ts = (\n", + " spark.read.table(\"ts\")\n", + " .filter(f\"is_pv = True and year = {year} and month = {month}\")\n", + " .select(\n", + " \"circuit_id\",\n", + " \"t_stamp\",\n", + " \"power\",\n", + " \"energy_reactive\",\n", + " \"voltage\",\n", + " \"year\",\n", + " \"month\",\n", + " )\n", + " )\n", + "\n", + " ts = ts.withColumn(\"P_kW\", col(\"power\") / 1000).drop(\"power\")\n", + " ts = ts.withColumn(\"Q_kvar\", col(\"energy_reactive\") / 1000 * 12).drop(\n", + " \"energy_reactive\"\n", + " )\n", "\n", - " ts = ts.withColumn(\"day_night\", expr(\"\"\"\n", + " ts = ts.withColumn(\n", + " \"day_night\",\n", + " expr(\"\"\"\n", " CASE \n", " WHEN hour(from_utc_timestamp(t_stamp, 'UTC+10')) BETWEEN 8 AND 16 \n", " THEN 'day'\n", " ELSE 'night'\n", " END\n", - " \"\"\"))\n", - "\n", - " meta = spark.read.table(\"meta_single_inverters_wrong_capacity_up2_3c\").select(\"circuit_id\",\"site_id\", \"s_id\",\"ac_capacity_kw\", 'circuit_polarity', 'wrong_capacity')\\\n", - " .filter(col(\"wrong_capacity\") == False).filter(col(\"ac_capacity_kw\").isNotNull()).drop(\"wrong_capacity\")\\\n", - " .withColumnRenamed(\"circuit_id\", \"circuit_id_meta\")\n", - " df = ts.join(meta, ts.circuit_id == meta.circuit_id_meta, \"inner\").drop(\"circuit_id_meta\")\n", - " df = df.withColumn('P_kW', col('P_kW') * col('circuit_polarity'))\n", - " df = df.withColumn('Q_kvar', col('Q_kvar') * col('circuit_polarity')).drop('circuit_polarity')\n", - " df = df.groupBy(\"site_id\", \"s_id\",\"t_stamp\", \"year\", \"month\", \"day_night\").agg(\n", + " \"\"\"),\n", + " )\n", + "\n", + " meta = (\n", + " spark.read.table(\"meta_single_inverters_wrong_capacity_up2_3c\")\n", + " .select(\n", + " \"circuit_id\",\n", + " \"site_id\",\n", + " \"s_id\",\n", + " \"ac_capacity_kw\",\n", + " \"circuit_polarity\",\n", + " \"wrong_capacity\",\n", + " )\n", + " .filter(col(\"wrong_capacity\") == False)\n", + " .filter(col(\"ac_capacity_kw\").isNotNull())\n", + " .drop(\"wrong_capacity\")\n", + " .withColumnRenamed(\"circuit_id\", \"circuit_id_meta\")\n", + " )\n", + " df = ts.join(meta, ts.circuit_id == meta.circuit_id_meta, \"inner\").drop(\n", + " \"circuit_id_meta\"\n", + " )\n", + " df = df.withColumn(\"P_kW\", col(\"P_kW\") * col(\"circuit_polarity\"))\n", + " df = df.withColumn(\"Q_kvar\", col(\"Q_kvar\") * col(\"circuit_polarity\")).drop(\n", + " \"circuit_polarity\"\n", + " )\n", + " df = df.groupBy(\"site_id\", \"s_id\", \"t_stamp\", \"year\", \"month\", \"day_night\").agg(\n", " spark_sum(\"P_kW\").alias(\"P_kW\"),\n", " spark_sum(\"Q_kvar\").alias(\"Q_kvar\"),\n", " avg(\"voltage\").alias(\"voltage\"),\n", - " avg(\"ac_capacity_kw\").alias(\"ac_capacity_kw\")\n", + " avg(\"ac_capacity_kw\").alias(\"ac_capacity_kw\"),\n", + " )\n", + " df = df.withColumn(\n", + " \"voltage\",\n", + " when((col(\"voltage\") > 300) | (col(\"voltage\") < 0), None).otherwise(\n", + " col(\"voltage\")\n", + " ),\n", " )\n", - " df = df.withColumn('voltage', when((col('voltage') > 300) | (col('voltage') < 0), None).otherwise(col('voltage')))\n", " df = df.filter(col(\"voltage\").isNotNull())\n", - " df = df.withColumn(\"PF\", col(\"P_kW\") / (col(\"P_kW\")**2 + col(\"Q_kvar\")**2)**0.5)\n", - " df = df.withColumn(\"Q_voltvar\", get_voltvar_Q_udf(df.voltage, df.ac_capacity_kw))\n", - " df = df.withColumn(\"Q_voltvar_max\", col(\"Q_voltvar\").cast(DoubleType()) + .04*df.ac_capacity_kw)\n", - " df = df.withColumn(\"Q_voltvar_min\", col(\"Q_voltvar\").cast(DoubleType()) - .04*df.ac_capacity_kw)\n", - " df = df.withColumn(\"Q_capability_absorbing\", Q_capability_absorbing_udf(df.P_kW, df.ac_capacity_kw))\n", + " df = df.withColumn(\n", + " \"PF\", col(\"P_kW\") / (col(\"P_kW\") ** 2 + col(\"Q_kvar\") ** 2) ** 0.5\n", + " )\n", + " df = df.withColumn(\n", + " \"Q_voltvar\", get_voltvar_Q_udf(df.voltage, df.ac_capacity_kw)\n", + " )\n", + " df = df.withColumn(\n", + " \"Q_voltvar_max\",\n", + " col(\"Q_voltvar\").cast(DoubleType()) + 0.04 * df.ac_capacity_kw,\n", + " )\n", + " df = df.withColumn(\n", + " \"Q_voltvar_min\",\n", + " col(\"Q_voltvar\").cast(DoubleType()) - 0.04 * df.ac_capacity_kw,\n", + " )\n", + " df = df.withColumn(\n", + " \"Q_capability_absorbing\",\n", + " Q_capability_absorbing_udf(df.P_kW, df.ac_capacity_kw),\n", + " )\n", " df = df.withColumn(\"Q_capability_supplying\", col(\"Q_capability_absorbing\") * -1)\n", "\n", - " df = df.withColumn(\"Q_voltvar_max\", when(col(\"Q_voltvar_max\") < lit(0), \\\n", - " greatest(col(\"Q_voltvar_max\"), col(\"Q_capability_absorbing\")+ .04*df.ac_capacity_kw))\\\n", - " .otherwise(col(\"Q_voltvar_max\")))\n", + " df = df.withColumn(\n", + " \"Q_voltvar_max\",\n", + " when(\n", + " col(\"Q_voltvar_max\") < lit(0),\n", + " greatest(\n", + " col(\"Q_voltvar_max\"),\n", + " col(\"Q_capability_absorbing\") + 0.04 * df.ac_capacity_kw,\n", + " ),\n", + " ).otherwise(col(\"Q_voltvar_max\")),\n", + " )\n", "\n", - " \n", - " df = df.withColumn(\"Q_voltvar_min\", when(col(\"Q_voltvar_min\") > lit(0), \\\n", - " least(col(\"Q_voltvar_min\"), col(\"Q_capability_supplying\")- .04*df.ac_capacity_kw))\\\n", - " .otherwise(col(\"Q_voltvar_min\")))\n", + " df = df.withColumn(\n", + " \"Q_voltvar_min\",\n", + " when(\n", + " col(\"Q_voltvar_min\") > lit(0),\n", + " least(\n", + " col(\"Q_voltvar_min\"),\n", + " col(\"Q_capability_supplying\") - 0.04 * df.ac_capacity_kw,\n", + " ),\n", + " ).otherwise(col(\"Q_voltvar_min\")),\n", + " )\n", "\n", - " df = df.withColumn(\"Q_impact\", Q_impact_udf(df.Q_kvar, df.Q_voltvar_max, df.Q_voltvar_min))\n", + " df = df.withColumn(\n", + " \"Q_impact\", Q_impact_udf(df.Q_kvar, df.Q_voltvar_max, df.Q_voltvar_min)\n", + " )\n", "\n", " thr1 = -0.1\n", " thr2 = 0.1\n", - " thr3 = .9\n", + " thr3 = 0.9\n", " thr4 = 1.1\n", - " outside_bounds = (col('Q_kvar') < col(\"Q_voltvar_min\")) | (col('Q_kvar') > col(\"Q_voltvar_max\"))\n", - " noncompliance_value = least(spark_abs(col('Q_kvar') - col('Q_voltvar_min')), spark_abs(col('Q_kvar') - col('Q_voltvar_max')))\n", - " \n", - " df = df.withColumn(\"Q_adverse\", when(outside_bounds & (col(\"Q_impact\") < thr1), \\\n", - " noncompliance_value).otherwise(lit(0)))\n", - " \n", - " df = df.withColumn(\"Q_inactive\", when(outside_bounds & (col(\"Q_impact\") >= thr1) & (col(\"Q_impact\") <= thr2), \\\n", - " noncompliance_value).otherwise(lit(0)))\n", - " \n", - " df = df.withColumn(\"Q_minor\", when(outside_bounds & (col(\"Q_impact\") > thr2) & (col(\"Q_impact\") < thr3), \\\n", - " noncompliance_value).otherwise(lit(0)))\n", - "\n", - " df = df.withColumn(\"Q_major\", when(outside_bounds & (col(\"Q_impact\") >= thr3) & (col(\"Q_impact\") <= thr4), \\\n", - " noncompliance_value).otherwise(lit(0)))\n", - "\n", - " df = df.withColumn(\"Q_benevolent\", when(outside_bounds & (col(\"Q_impact\") > thr4), \\\n", - " noncompliance_value).otherwise(lit(0)))\n", - " \n", - " df = df.withColumn(\"noncompliance_voltvar\", \\\n", - " col(\"Q_adverse\") + col(\"Q_inactive\") +col(\"Q_minor\") +col(\"Q_major\") +\\\n", - " col(\"Q_benevolent\"))\n", + " outside_bounds = (col(\"Q_kvar\") < col(\"Q_voltvar_min\")) | (\n", + " col(\"Q_kvar\") > col(\"Q_voltvar_max\")\n", + " )\n", + " noncompliance_value = least(\n", + " spark_abs(col(\"Q_kvar\") - col(\"Q_voltvar_min\")),\n", + " spark_abs(col(\"Q_kvar\") - col(\"Q_voltvar_max\")),\n", + " )\n", + "\n", + " df = df.withColumn(\n", + " \"Q_adverse\",\n", + " when(\n", + " outside_bounds & (col(\"Q_impact\") < thr1), noncompliance_value\n", + " ).otherwise(lit(0)),\n", + " )\n", + "\n", + " df = df.withColumn(\n", + " \"Q_inactive\",\n", + " when(\n", + " outside_bounds & (col(\"Q_impact\") >= thr1) & (col(\"Q_impact\") <= thr2),\n", + " noncompliance_value,\n", + " ).otherwise(lit(0)),\n", + " )\n", + "\n", + " df = df.withColumn(\n", + " \"Q_minor\",\n", + " when(\n", + " outside_bounds & (col(\"Q_impact\") > thr2) & (col(\"Q_impact\") < thr3),\n", + " noncompliance_value,\n", + " ).otherwise(lit(0)),\n", + " )\n", + "\n", + " df = df.withColumn(\n", + " \"Q_major\",\n", + " when(\n", + " outside_bounds & (col(\"Q_impact\") >= thr3) & (col(\"Q_impact\") <= thr4),\n", + " noncompliance_value,\n", + " ).otherwise(lit(0)),\n", + " )\n", + "\n", + " df = df.withColumn(\n", + " \"Q_benevolent\",\n", + " when(\n", + " outside_bounds & (col(\"Q_impact\") > thr4), noncompliance_value\n", + " ).otherwise(lit(0)),\n", + " )\n", + "\n", + " df = df.withColumn(\n", + " \"noncompliance_voltvar\",\n", + " col(\"Q_adverse\")\n", + " + col(\"Q_inactive\")\n", + " + col(\"Q_minor\")\n", + " + col(\"Q_major\")\n", + " + col(\"Q_benevolent\"),\n", + " )\n", "\n", " df = df.withColumn(\"day\", dayofmonth(col(\"t_stamp\")))\n", - " comp = df.groupBy('year','month',\"day\", \"site_id\", \"s_id\", \"day_night\")\\\n", - " .agg(spark_sum(when(col(\"noncompliance_voltvar\") > 0, 1).otherwise(0)).alias(\"noncompliance_voltvar_count\"), \\\n", + " comp = df.groupBy(\"year\", \"month\", \"day\", \"site_id\", \"s_id\", \"day_night\").agg(\n", + " spark_sum(when(col(\"noncompliance_voltvar\") > 0, 1).otherwise(0)).alias(\n", + " \"noncompliance_voltvar_count\"\n", + " ),\n", " spark_sum(col(\"noncompliance_voltvar\")).alias(\"noncompliance_voltvar_sum\"),\n", - " spark_sum(col(\"Q_adverse\")).alias(\"Q_adverse\"),\n", - " spark_sum(col(\"Q_inactive\")).alias(\"Q_inactive\"),\n", - " spark_sum(col(\"Q_minor\")).alias(\"Q_minor\"),\n", - " spark_sum(col(\"Q_major\")).alias(\"Q_major\"),\n", - " spark_sum(col(\"Q_benevolent\")).alias(\"Q_benevolent\"),\n", - " spark_count(col(\"noncompliance_voltvar\")).alias(\"total_count\"),\n", - " spark_sum(when(col(\"Q_adverse\") > 0, 1).otherwise(0)).alias(\"Q_adverse_count\"), \\\n", - " spark_sum(when(col(\"Q_inactive\") > 0, 1).otherwise(0)).alias(\"Q_inactive_count\"), \\\n", - " spark_sum(when(col(\"Q_minor\") > 0, 1).otherwise(0)).alias(\"Q_minor_count\"), \\\n", - " spark_sum(when(col(\"Q_major\") > 0, 1).otherwise(0)).alias(\"Q_major_count\"), \\\n", - " spark_sum(when(col(\"Q_benevolent\") > 0, 1).otherwise(0)).alias(\"Q_benevolent_count\")\n", - " )\\\n", - " # .sort(\"noncompliance_voltvar_count\", ascending=False)\n", + " spark_sum(col(\"Q_adverse\")).alias(\"Q_adverse\"),\n", + " spark_sum(col(\"Q_inactive\")).alias(\"Q_inactive\"),\n", + " spark_sum(col(\"Q_minor\")).alias(\"Q_minor\"),\n", + " spark_sum(col(\"Q_major\")).alias(\"Q_major\"),\n", + " spark_sum(col(\"Q_benevolent\")).alias(\"Q_benevolent\"),\n", + " spark_count(col(\"noncompliance_voltvar\")).alias(\"total_count\"),\n", + " spark_sum(when(col(\"Q_adverse\") > 0, 1).otherwise(0)).alias(\n", + " \"Q_adverse_count\"\n", + " ),\n", + " spark_sum(when(col(\"Q_inactive\") > 0, 1).otherwise(0)).alias(\n", + " \"Q_inactive_count\"\n", + " ),\n", + " spark_sum(when(col(\"Q_minor\") > 0, 1).otherwise(0)).alias(\"Q_minor_count\"),\n", + " spark_sum(when(col(\"Q_major\") > 0, 1).otherwise(0)).alias(\"Q_major_count\"),\n", + " spark_sum(when(col(\"Q_benevolent\") > 0, 1).otherwise(0)).alias(\n", + " \"Q_benevolent_count\"\n", + " ),\n", + " )\n", + " # .sort(\"noncompliance_voltvar_count\", ascending=False)\n", " if first_write:\n", - " comp.write.mode(\"overwrite\").parquet(f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltvar.parquet\")\n", + " comp.write.mode(\"overwrite\").parquet(\n", + " f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltvar.parquet\"\n", + " )\n", " first_write = False\n", " else:\n", - " comp.write.mode(\"append\").parquet(f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltvar.parquet\")" + " comp.write.mode(\"append\").parquet(\n", + " f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltvar.parquet\"\n", + " )" ] }, { @@ -660,7 +780,11 @@ } ], "source": [ - "df.filter((col(\"noncompliance_voltvar\") == lit(0)) & (col(\"voltage\") > lit(250)) & (col(\"P_kW\") > lit(1))).show(10, truncate=False)" + "df.filter(\n", + " (col(\"noncompliance_voltvar\") == lit(0))\n", + " & (col(\"voltage\") > lit(250))\n", + " & (col(\"P_kW\") > lit(1))\n", + ").show(10, truncate=False)" ] }, { @@ -743,8 +867,9 @@ } ], "source": [ - "spark.read.table(\"ts\").filter(f\"is_pv = True and year = 2024 and month =1\")\\\n", - " .select(\"circuit_id\", \"t_stamp\", \"power\", \"energy_reactive\", \"voltage\", \"year\", \"month\").show(1)" + "spark.read.table(\"ts\").filter(f\"is_pv = True and year = 2024 and month =1\").select(\n", + " \"circuit_id\", \"t_stamp\", \"power\", \"energy_reactive\", \"voltage\", \"year\", \"month\"\n", + ").show(1)" ] }, { @@ -847,8 +972,19 @@ } ], "source": [ - "d = d.query(f\"t_stamp=='2024-01-18 00:40:00'\")#,\"Q_voltvar_min\", \"Q_voltvar_max\"\n", - "d[[\"t_stamp\",'P_kW', \"Q_kvar\", 'voltage','ac_capacity_kw','Q_voltvar',\"Q_voltvar_min\", \"Q_voltvar_max\"]]" + "d = d.query(f\"t_stamp=='2024-01-18 00:40:00'\") # ,\"Q_voltvar_min\", \"Q_voltvar_max\"\n", + "d[\n", + " [\n", + " \"t_stamp\",\n", + " \"P_kW\",\n", + " \"Q_kvar\",\n", + " \"voltage\",\n", + " \"ac_capacity_kw\",\n", + " \"Q_voltvar\",\n", + " \"Q_voltvar_min\",\n", + " \"Q_voltvar_max\",\n", + " ]\n", + "]" ] }, { @@ -876,7 +1012,9 @@ } ], "source": [ - "get_voltvar_Q(238.05, 2.828440, Srated=5, v1=207, v2=220, v3=240, v4=258, Q1=.44, Q4=.60)" + "get_voltvar_Q(\n", + " 238.05, 2.828440, Srated=5, v1=207, v2=220, v3=240, v4=258, Q1=0.44, Q4=0.60\n", + ")" ] }, { @@ -974,7 +1112,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = spark.read.parquet(f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltvar.parquet\")" + "df = spark.read.parquet(\n", + " f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltvar.parquet\"\n", + ")" ] }, { @@ -1144,7 +1284,7 @@ } ], "source": [ - "230*.9" + "230 * 0.9" ] }, { @@ -1268,13 +1408,15 @@ } ], "source": [ - "spark.sql(\"with circuit_counts as ( \\\n", + "spark.sql(\n", + " \"with circuit_counts as ( \\\n", " select site_id, count( circuit_id) as num_circuits from meta_single_inverters_wrong_capacity \\\n", " where wrong_capacity = False \\\n", " group by site_id \\\n", " ) \\\n", " select count(site_id) from circuit_counts \\\n", - " where num_circuits > 3\").show(30, truncate=False)" + " where num_circuits > 3\"\n", + ").show(30, truncate=False)" ] }, { @@ -1326,9 +1468,11 @@ } ], "source": [ - "spark.sql(\"select * from meta_single_inverters_wrong_capacity where site_id in ( \\\n", + "spark.sql(\n", + " \"select * from meta_single_inverters_wrong_capacity where site_id in ( \\\n", " select site_id from meta_single_inverters_wrong_capacity group by site_id having count(circuit_id) <= 3 \\\n", - " )\").show(30, truncate=False)" + " )\"\n", + ").show(30, truncate=False)" ] }, { @@ -1351,8 +1495,10 @@ } ], "source": [ - "spark.sql(\"select * from meta_single_inverters_wrong_capacity \\\n", - " where site_id = 25610840\").show(10, truncate=False)" + "spark.sql(\n", + " \"select * from meta_single_inverters_wrong_capacity \\\n", + " where site_id = 25610840\"\n", + ").show(10, truncate=False)" ] }, { @@ -1375,8 +1521,10 @@ } ], "source": [ - "spark.sql(\"select * from sites \\\n", - " where site_id = 25610840\").show(10, truncate=False)" + "spark.sql(\n", + " \"select * from sites \\\n", + " where site_id = 25610840\"\n", + ").show(10, truncate=False)" ] }, { @@ -1395,7 +1543,9 @@ ], "source": [ "dfp = df.filter(\"site_id = 25610840\").toPandas()\n", - "dfp['t_stamp'] = dfp['t_stamp'].dt.tz_localize('UTC').dt.tz_convert(pytz.FixedOffset(10*60)) # Convert to UTC+110" + "dfp[\"t_stamp\"] = (\n", + " dfp[\"t_stamp\"].dt.tz_localize(\"UTC\").dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + ") # Convert to UTC+110" ] }, { @@ -1426,31 +1576,71 @@ } ], "source": [ - "start_time = '2024-01-08 00:00:00+10:00'\t # In sydney local time\n", - "end_time = '2024-01-09 00:00:00+10:00'\t # In sydney local time\n", + "start_time = \"2024-01-08 00:00:00+10:00\" # In sydney local time\n", + "end_time = \"2024-01-09 00:00:00+10:00\" # In sydney local time\n", "\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['Reactive power (kvar)', 'Reactive power (kvar)', 'Reactive power (kvar)', 'voltage (V)', 'Active power (kW)']\n", - "plt_config = {'Q_voltvar_max': [0, 0, '-.', None, None], 'Q_voltvar_min': [0, 0, '-.', None, None],'Q_kvar': [0, 0, '-', None, None],\n", - "'voltage': [0, 1, '-', None, None], 'P_kW': [1, 0, '-', None, None]}\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"Reactive power (kvar)\",\n", + " \"Reactive power (kvar)\",\n", + " \"Reactive power (kvar)\",\n", + " \"voltage (V)\",\n", + " \"Active power (kW)\",\n", + "]\n", + "plt_config = {\n", + " \"Q_voltvar_max\": [0, 0, \"-.\", None, None],\n", + " \"Q_voltvar_min\": [0, 0, \"-.\", None, None],\n", + " \"Q_kvar\": [0, 0, \"-\", None, None],\n", + " \"voltage\": [0, 1, \"-\", None, None],\n", + " \"P_kW\": [1, 0, \"-\", None, None],\n", + "}\n", "# y_labels = ['Reactive power (kvar)', 'Reactive power (kvar)', 'voltage (V)', 'Active power (kW)']\n", "# plt_config = {'Q_voltvar_max': [0, 0, '-.', None, None], 'Q_voltvar_min': [0, 0, '-.', None, None],\n", "# 'voltage': [0, 1, '-', None, None], 'P_kW': [1, 0, '-', None, None]}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, dfp, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,2.5], same_scale=1, fontsize=7, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['lower left', 'upper right', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=1, onlyntime=1)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " dfp,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 2.5],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"lower left\", \"upper right\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=1,\n", + " onlyntime=1,\n", + ")\n", "a.do()" ] }, @@ -1527,7 +1717,9 @@ } ], "source": [ - "df.filter(col(\"noncompliance_voltvar\") > 0).select(\"circuit_id\").show(40, truncate=False)" + "df.filter(col(\"noncompliance_voltvar\") > 0).select(\"circuit_id\").show(\n", + " 40, truncate=False\n", + ")" ] }, { @@ -1564,11 +1756,15 @@ } ], "source": [ - "dfv = spark.read.table(\"ts\").filter(\"is_pv = True \")\\\n", - " .select(\"circuit_id\", \"t_stamp\", \"voltage\").filter(\"voltage < 0\")\n", - "dfv.groupBy(\"circuit_id\").agg(\n", - " spark_count(\"voltage\").alias(\"count_above_300\")\n", - ").sort(\"count_above_300\", ascending=False).show(truncate=False)" + "dfv = (\n", + " spark.read.table(\"ts\")\n", + " .filter(\"is_pv = True \")\n", + " .select(\"circuit_id\", \"t_stamp\", \"voltage\")\n", + " .filter(\"voltage < 0\")\n", + ")\n", + "dfv.groupBy(\"circuit_id\").agg(spark_count(\"voltage\").alias(\"count_above_300\")).sort(\n", + " \"count_above_300\", ascending=False\n", + ").show(truncate=False)" ] }, { @@ -1624,14 +1820,19 @@ } ], "source": [ - "df = spark.read.table(\"ts\").filter(\"is_pv = True and year = 2024 and month=1\").select(\"circuit_id\", \"t_stamp\")\n", + "df = (\n", + " spark.read.table(\"ts\")\n", + " .filter(\"is_pv = True and year = 2024 and month=1\")\n", + " .select(\"circuit_id\", \"t_stamp\")\n", + ")\n", "\n", "# Define a window partitioned by circuit_id and ordered by timestamp\n", "w = Window.partitionBy(\"circuit_id\").orderBy(\"t_stamp\")\n", "\n", "# Compute difference with previous row in seconds\n", - "df_diff = df.withColumn(\"prev_t_stamp\", lag(\"t_stamp\").over(w)) \\\n", - " .withColumn(\"diff_seconds\", (col(\"t_stamp\").cast(\"long\") - col(\"prev_t_stamp\").cast(\"long\")))\n", + "df_diff = df.withColumn(\"prev_t_stamp\", lag(\"t_stamp\").over(w)).withColumn(\n", + " \"diff_seconds\", (col(\"t_stamp\").cast(\"long\") - col(\"prev_t_stamp\").cast(\"long\"))\n", + ")\n", "\n", "# Show distinct differences per circuit\n", "min_diff_per_circuit = df_diff.groupBy(\"circuit_id\").agg(\n", diff --git a/SolA2024_Analysis/Volt-Watt-Trino.ipynb b/SolA2024_Analysis/Volt-Watt-Trino.ipynb index 67abbae..320f7a7 100644 --- a/SolA2024_Analysis/Volt-Watt-Trino.ipynb +++ b/SolA2024_Analysis/Volt-Watt-Trino.ipynb @@ -7,12 +7,15 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", - "import numpy as np\n", - "import boto3\n", "import random\n", + "\n", + "import boto3\n", + "import numpy as np\n", + "\n", + "from Data_query.trino_config import *\n", + "\n", "session = boto3.Session()\n", - "s3 = boto3.client('s3')" + "s3 = boto3.client(\"s3\")" ] }, { @@ -30,7 +33,7 @@ } ], "source": [ - "stop_trino()\n" + "stop_trino()" ] }, { @@ -42,7 +45,7 @@ "source": [ "big_workers = 1\n", "workers = 0\n", - "num_workers = max(workers, big_workers)\n" + "num_workers = max(workers, big_workers)" ] }, { @@ -62,8 +65,8 @@ } ], "source": [ - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", - "sleep(90) # wait for trino to be ready\n" + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", + "sleep(90) # wait for trino to be ready" ] }, { @@ -206,8 +209,10 @@ } ], "source": [ - "v1=253\n", - "v2=260\n", + "v1 = 253\n", + "v2 = 260\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " iceberg_exec(f\"\"\"\n", @@ -242,12 +247,24 @@ " group by year, month, day, site_id\n", " order by nonconformance_voltwatt_sum desc\n", " \"\"\")\n", - " sleep(random.randint(1, 10)) # add some randomness to avoid overwhelming trino with simultaneous queries\n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " sleep(\n", + " random.randint(1, 10)\n", + " ) # add some randomness to avoid overwhelming trino with simultaneous queries\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return None\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, 2025) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 7', 'system.bucket(postcode, 16) > 7'] ]\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 7\",\n", + " \"system.bucket(postcode, 16) > 7\",\n", + " ]\n", + "]\n", "trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -450,10 +467,10 @@ " if V < v1:\n", " return Srated\n", " elif V > v2:\n", - " return .2 * Srated\n", + " return 0.2 * Srated\n", " else:\n", - " m = (Srated - .2*Srated) / (v1 - v2)\n", - " P = m * (V - v2) + .2*Srated\n", + " m = (Srated - 0.2 * Srated) / (v1 - v2)\n", + " P = m * (V - v2) + 0.2 * Srated\n", " return P" ] }, @@ -630,8 +647,8 @@ } ], "source": [ - "v1=253\n", - "v2=260\n", + "v1 = 253\n", + "v2 = 260\n", "iceberg_sql(f\"\"\"\n", " with data as (\n", " select site_id, t_stamp, sum(power*circuit_polarity/1000) as P_kW, \n", @@ -735,7 +752,9 @@ } ], "source": [ - "iceberg_sql(\"select circuit_id, circuit_polarity from meta_up23c where circuit_id = 467634\")" + "iceberg_sql(\n", + " \"select circuit_id, circuit_polarity from meta_up23c where circuit_id = 467634\"\n", + ")" ] }, { diff --git a/SolA2024_Analysis/Volt-Watt-ghi.ipynb b/SolA2024_Analysis/Volt-Watt-ghi.ipynb index 2195710..9cd6e01 100644 --- a/SolA2024_Analysis/Volt-Watt-ghi.ipynb +++ b/SolA2024_Analysis/Volt-Watt-ghi.ipynb @@ -7,13 +7,16 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", - "import numpy as np\n", - "import boto3\n", "import random\n", + "\n", + "import boto3\n", + "import numpy as np\n", "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "\n", "session = boto3.Session()\n", - "s3 = boto3.client('s3')" + "s3 = boto3.client(\"s3\")" ] }, { @@ -31,7 +34,7 @@ } ], "source": [ - "stop_trino()\n" + "stop_trino()" ] }, { @@ -64,8 +67,8 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", - "sleep(60) # wait for trino to be ready\n" + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", + "sleep(60) # wait for trino to be ready" ] }, { @@ -308,8 +311,10 @@ } ], "source": [ - "v1=253\n", - "v2=260\n", + "v1 = 253\n", + "v2 = 260\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -348,12 +353,24 @@ " order by nonconformance_voltwattghi_sum desc\n", " \"\"\")\n", " # sleep(random.randint(1, 10)) # add some randomness to avoid overwhelming trino with simultaneous queries\n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 7', 'system.bucket(postcode, 16) > 7'] ]\n", - "df = trino_parallel_batch(run_func, tasks, num_workers=num_workers, batch_size=num_workers)\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 7\",\n", + " \"system.bucket(postcode, 16) > 7\",\n", + " ]\n", + "]\n", + "df = trino_parallel_batch(\n", + " run_func, tasks, num_workers=num_workers, batch_size=num_workers\n", + ")\n", "df" ] }, @@ -650,8 +667,10 @@ } ], "source": [ - "v1=253\n", - "v2=260\n", + "v1 = 253\n", + "v2 = 260\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -680,11 +699,21 @@ " where uncurtailed_P < max_P_volt_watt and nonconformance_voltwatt = 0 and V > 253\n", " \"\"\")\n", " # sleep(random.randint(1, 10)) # add some randomness to avoid overwhelming trino with simultaneous queries\n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 7', 'system.bucket(postcode, 16) > 7'] ]\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 7\",\n", + " \"system.bucket(postcode, 16) > 7\",\n", + " ]\n", + "]\n", "df = trino_parallel(run_func, tasks, num_workers=1)\n", "df" ] @@ -756,8 +785,14 @@ } ], "source": [ - "df = iceberg_sql(\"\"\"select * from all_uncurtailedPV where year=2024 and month=1 and site_id=45544685 and t_stamp = timestamp '2024-01-25 04:30:00'\"\"\")\n", - "df['t_stamp'] = pd.to_datetime(df['t_stamp']).dt.tz_localize('UTC').dt.tz_convert(pytz.FixedOffset(600))\n", + "df = iceberg_sql(\n", + " \"\"\"select * from all_uncurtailedPV where year=2024 and month=1 and site_id=45544685 and t_stamp = timestamp '2024-01-25 04:30:00'\"\"\"\n", + ")\n", + "df[\"t_stamp\"] = (\n", + " pd.to_datetime(df[\"t_stamp\"])\n", + " .dt.tz_localize(\"UTC\")\n", + " .dt.tz_convert(pytz.FixedOffset(600))\n", + ")\n", "df" ] }, @@ -774,10 +809,10 @@ " if V < v1:\n", " return Srated\n", " elif V > v2:\n", - " return .2 * Srated\n", + " return 0.2 * Srated\n", " else:\n", - " m = (Srated - .2*Srated) / (v1 - v2)\n", - " P = m * (V - v2) + .2*Srated\n", + " m = (Srated - 0.2 * Srated) / (v1 - v2)\n", + " P = m * (V - v2) + 0.2 * Srated\n", " return P" ] }, @@ -954,8 +989,8 @@ } ], "source": [ - "v1=253\n", - "v2=260\n", + "v1 = 253\n", + "v2 = 260\n", "iceberg_sql(f\"\"\"\n", " with data as (\n", " select site_id, t_stamp, sum(power*circuit_polarity/1000) as P_kW, \n", @@ -1059,7 +1094,9 @@ } ], "source": [ - "iceberg_sql(\"select circuit_id, circuit_polarity from meta_up23c where circuit_id = 467634\")" + "iceberg_sql(\n", + " \"select circuit_id, circuit_polarity from meta_up23c where circuit_id = 467634\"\n", + ")" ] }, { diff --git a/SolA2024_Analysis/Volt-Watt.ipynb b/SolA2024_Analysis/Volt-Watt.ipynb index a9ccd47..6e968e4 100644 --- a/SolA2024_Analysis/Volt-Watt.ipynb +++ b/SolA2024_Analysis/Volt-Watt.ipynb @@ -21,10 +21,11 @@ } ], "source": [ + "import pytz\n", + "\n", "from Data_query.spark_config import *\n", "from visualisation import *\n", "\n", - "import pytz\n", "spark.catalog.setCurrentDatabase(\"solar_analytics\")\n", "warehouse_dir = spark.conf.get(\"spark.sql.warehouse.dir\")\n", "print(warehouse_dir)" @@ -83,7 +84,9 @@ } ], "source": [ - "spark.sql(\"select * from compliance_voltwatt where site_id = 305063631 and month=1\").show(3)" + "spark.sql(\n", + " \"select * from compliance_voltwatt where site_id = 305063631 and month=1\"\n", + ").show(3)" ] }, { @@ -120,7 +123,9 @@ } ], "source": [ - "spark.sql(\"select * from meta_single_inverters_wrong_capacity_up2_3c where s_id = 'S3146'\").show(3)" + "spark.sql(\n", + " \"select * from meta_single_inverters_wrong_capacity_up2_3c where s_id = 'S3146'\"\n", + ").show(3)" ] }, { @@ -136,11 +141,13 @@ " if V < v1:\n", " return Srated\n", " elif V > v2:\n", - " return .2 * Srated\n", + " return 0.2 * Srated\n", " else:\n", - " m = (Srated - .2*Srated) / (v1 - v2)\n", - " P = m * (V - v2) + .2*Srated\n", + " m = (Srated - 0.2 * Srated) / (v1 - v2)\n", + " P = m * (V - v2) + 0.2 * Srated\n", " return P\n", + "\n", + "\n", "volt_watt_udf = udf(get_max_P, DoubleType())" ] }, @@ -163,35 +170,77 @@ "first_write = True\n", "for year in (2024, 2025):\n", " for month in range(1, 13):\n", - " ts = spark.read.table(\"ts\").filter(f\"is_pv = True and year={year} and month = {month}\")\\\n", + " ts = (\n", + " spark.read.table(\"ts\")\n", + " .filter(f\"is_pv = True and year={year} and month = {month}\")\n", " .select(\"circuit_id\", \"t_stamp\", \"power\", \"voltage\", \"year\", \"month\")\n", - " ts = ts.withColumn('voltage', when((col('voltage') > 300) | (col('voltage') < 50), None).otherwise(col('voltage')))\n", - " ts = ts.withColumn('P_kW', col('power') / 1000).drop('power')\n", - " meta = spark.read.table(\"meta_single_inverters_wrong_capacity_up2_3c\").select(\"circuit_id\",\"site_id\", \"s_id\", \"ac_capacity_kw\", 'circuit_polarity', 'wrong_capacity')\\\n", - " .filter(col(\"wrong_capacity\") == False).filter(col(\"ac_capacity_kw\").isNotNull()).drop(\"wrong_capacity\")\\\n", - " .withColumnRenamed(\"circuit_id\", \"circuit_id_meta\")\n", - " df = ts.join(meta, ts.circuit_id == meta.circuit_id_meta, \"inner\").drop(\"circuit_id_meta\")\n", - " df = df.withColumn('P_kW', col('P_kW') * col('circuit_polarity')).drop('circuit_polarity')\n", + " )\n", + " ts = ts.withColumn(\n", + " \"voltage\",\n", + " when((col(\"voltage\") > 300) | (col(\"voltage\") < 50), None).otherwise(\n", + " col(\"voltage\")\n", + " ),\n", + " )\n", + " ts = ts.withColumn(\"P_kW\", col(\"power\") / 1000).drop(\"power\")\n", + " meta = (\n", + " spark.read.table(\"meta_single_inverters_wrong_capacity_up2_3c\")\n", + " .select(\n", + " \"circuit_id\",\n", + " \"site_id\",\n", + " \"s_id\",\n", + " \"ac_capacity_kw\",\n", + " \"circuit_polarity\",\n", + " \"wrong_capacity\",\n", + " )\n", + " .filter(col(\"wrong_capacity\") == False)\n", + " .filter(col(\"ac_capacity_kw\").isNotNull())\n", + " .drop(\"wrong_capacity\")\n", + " .withColumnRenamed(\"circuit_id\", \"circuit_id_meta\")\n", + " )\n", + " df = ts.join(meta, ts.circuit_id == meta.circuit_id_meta, \"inner\").drop(\n", + " \"circuit_id_meta\"\n", + " )\n", + " df = df.withColumn(\"P_kW\", col(\"P_kW\") * col(\"circuit_polarity\")).drop(\n", + " \"circuit_polarity\"\n", + " )\n", " df = df.filter(col(\"voltage\") > 253)\n", - " df = df.groupBy(\"site_id\", \"s_id\",\"t_stamp\", \"year\", \"month\").agg(\n", + " df = df.groupBy(\"site_id\", \"s_id\", \"t_stamp\", \"year\", \"month\").agg(\n", " spark_sum(\"P_kW\").alias(\"P_kW\"),\n", " avg(\"voltage\").alias(\"voltage\"),\n", - " avg(\"ac_capacity_kw\").alias(\"ac_capacity_kw\")\n", + " avg(\"ac_capacity_kw\").alias(\"ac_capacity_kw\"),\n", + " )\n", + " df = df.withColumn(\n", + " \"max_P_volt_watt\",\n", + " volt_watt_udf(df.voltage, df.ac_capacity_kw) + 0.04 * df.ac_capacity_kw,\n", " )\n", - " df = df.withColumn(\"max_P_volt_watt\", volt_watt_udf(df.voltage, df.ac_capacity_kw) + 0.04*df.ac_capacity_kw)\n", " df = df.drop(\"ac_capacity_kw\")\n", - " df = df.withColumn(\"noncompliance_voltwatt\", greatest(col('P_kW') - col('max_P_volt_watt'), lit(0)))\n", + " df = df.withColumn(\n", + " \"noncompliance_voltwatt\",\n", + " greatest(col(\"P_kW\") - col(\"max_P_volt_watt\"), lit(0)),\n", + " )\n", " df = df.withColumn(\"day\", dayofmonth(col(\"t_stamp\")))\n", - " comp = df.groupBy('year','month',\"day\",\"site_id\", \"s_id\")\\\n", - " .agg(spark_sum(when(col(\"noncompliance_voltwatt\") > 0, 1).otherwise(0)).alias(\"noncompliance_voltwatt_count\"), \\\n", - " spark_sum(col(\"noncompliance_voltwatt\")).alias(\"noncompliance_voltwatt_sum\"),\n", - " spark_count(col(\"noncompliance_voltwatt\")).alias(\"total_count\"))\\\n", + " comp = (\n", + " df.groupBy(\"year\", \"month\", \"day\", \"site_id\", \"s_id\")\n", + " .agg(\n", + " spark_sum(\n", + " when(col(\"noncompliance_voltwatt\") > 0, 1).otherwise(0)\n", + " ).alias(\"noncompliance_voltwatt_count\"),\n", + " spark_sum(col(\"noncompliance_voltwatt\")).alias(\n", + " \"noncompliance_voltwatt_sum\"\n", + " ),\n", + " spark_count(col(\"noncompliance_voltwatt\")).alias(\"total_count\"),\n", + " )\n", " .sort(\"noncompliance_voltwatt_count\", ascending=False)\n", + " )\n", " if first_write:\n", - " comp.write.mode(\"overwrite\").parquet(f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltwatt.parquet\")\n", + " comp.write.mode(\"overwrite\").parquet(\n", + " f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltwatt.parquet\"\n", + " )\n", " first_write = False\n", " else:\n", - " comp.write.mode(\"append\").parquet(f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltwatt.parquet\")" + " comp.write.mode(\"append\").parquet(\n", + " f\"{warehouse_dir}/Compliance_results_SolA/compliance_voltwatt.parquet\"\n", + " )" ] }, { @@ -426,7 +475,9 @@ } ], "source": [ - "spark.read.table(\"meta_single_inverters_wrong_capacity\").filter(\"circuit_id = 454093\").show()" + "spark.read.table(\"meta_single_inverters_wrong_capacity\").filter(\n", + " \"circuit_id = 454093\"\n", + ").show()" ] }, { @@ -445,7 +496,9 @@ ], "source": [ "dfp = df.filter(\"site_id = 465008538\").toPandas()\n", - "dfp['t_stamp'] = dfp['t_stamp'].dt.tz_localize('UTC').dt.tz_convert(pytz.FixedOffset(10*60)) # Convert to UTC+110" + "dfp[\"t_stamp\"] = (\n", + " dfp[\"t_stamp\"].dt.tz_localize(\"UTC\").dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + ") # Convert to UTC+110" ] }, { @@ -588,27 +641,59 @@ } ], "source": [ - "start_time = '2024-01-08 00:00:00+10:00'\t # In sydney local time\n", - "end_time = '2024-01-09 00:00:00+10:00'\t # In sydney local time\n", + "start_time = \"2024-01-08 00:00:00+10:00\" # In sydney local time\n", + "end_time = \"2024-01-09 00:00:00+10:00\" # In sydney local time\n", "\n", "\n", - "num_ticks = 12*2+1\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['Active power (kW)', 'Active power (kW)', 'voltage (V)']\n", - "plt_config = {'max_P_volt_watt': [0, 0, '-.', None, None], 'P_kW': [0, 0, '-', None, None],\n", - "'voltage': [0, 1, '-', None, None]}\n", + "num_ticks = 12 * 2 + 1\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\"Active power (kW)\", \"Active power (kW)\", \"voltage (V)\"]\n", + "plt_config = {\n", + " \"max_P_volt_watt\": [0, 0, \"-.\", None, None],\n", + " \"P_kW\": [0, 0, \"-\", None, None],\n", + " \"voltage\": [0, 1, \"-\", None, None],\n", + "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, dfp, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='viridis',\n", - " figsize=[14/2.54,2], same_scale=1, fontsize=7, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " dfp,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[14 / 2.54, 2],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -1669,7 +1754,9 @@ } ], "source": [ - "spark.sql(\"select * from circuits left join sites on circuits.site_id = sites.site_id\").filter(\"circuit_id = 75410\").show()" + "spark.sql(\n", + " \"select * from circuits left join sites on circuits.site_id = sites.site_id\"\n", + ").filter(\"circuit_id = 75410\").show()" ] }, { @@ -1749,8 +1836,10 @@ } ], "source": [ - "spark.sql(\"select * from ts \\\n", - "where circuit_id in (224045, 224043, 224044) and year = 2024 and month=1 and hour(t_stamp) > 1 order by t_stamp limit 10\").show()" + "spark.sql(\n", + " \"select * from ts \\\n", + "where circuit_id in (224045, 224043, 224044) and year = 2024 and month=1 and hour(t_stamp) > 1 order by t_stamp limit 10\"\n", + ").show()" ] }, { @@ -1773,8 +1862,10 @@ } ], "source": [ - "spark.sql(\"select * from sites \\\n", - "where site_id = 1844098728\").show()" + "spark.sql(\n", + " \"select * from sites \\\n", + "where site_id = 1844098728\"\n", + ").show()" ] }, { @@ -1816,10 +1907,12 @@ } ], "source": [ - "spark.sql(\"select site_id from meta_single_inverters_wrong_capacity \\\n", + "spark.sql(\n", + " \"select site_id from meta_single_inverters_wrong_capacity \\\n", "group by site_id \\\n", "having count(circuit_id) > 1 \\\n", - " and cast(avg(ac_capacity_kw) as integer) = cast(min(ac_capacity_kw) as integer)\").show()" + " and cast(avg(ac_capacity_kw) as integer) = cast(min(ac_capacity_kw) as integer)\"\n", + ").show()" ] }, { @@ -1905,7 +1998,8 @@ } ], "source": [ - "spark.sql(\"with data as (\\\n", + "spark.sql(\n", + " \"with data as (\\\n", "select site_id, t, avg(P*circuit_polarity) as P \\\n", "from \\\n", " (select circuit_id, t_stamp as t, power/1000 as P \\\n", @@ -1919,7 +2013,8 @@ "group by site_id, t), \\\n", "negative_powers as (select site_id, percentile_approx(P, .01) as p01, min(P) as P_min from data \\\n", " group by site_id) \\\n", - " select count(site_id) as count_sites from negative_powers where P_min < -1\").show(20, truncate=False)" + " select count(site_id) as count_sites from negative_powers where P_min < -1\"\n", + ").show(20, truncate=False)" ] } ], diff --git a/SolA2024_Analysis/curtailment_voltvar.ipynb b/SolA2024_Analysis/curtailment_voltvar.ipynb index 790ad1a..bedb462 100644 --- a/SolA2024_Analysis/curtailment_voltvar.ipynb +++ b/SolA2024_Analysis/curtailment_voltvar.ipynb @@ -7,13 +7,16 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", - "import numpy as np\n", - "import boto3\n", "import random\n", + "\n", + "import boto3\n", + "import numpy as np\n", "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "\n", "session = boto3.Session()\n", - "s3 = boto3.client('s3')" + "s3 = boto3.client(\"s3\")" ] }, { @@ -54,7 +57,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(60) # wait for trino to be ready" ] }, @@ -77,17 +80,19 @@ } ], "source": [ - "v1=207\n", - "v2=220\n", - "v3=240\n", - "v4=258\n", + "v1 = 207\n", + "v2 = 220\n", + "v3 = 240\n", + "v4 = 258\n", "voltwatt_V = 253\n", - "Q1=.44\n", - "Q4=.60\n", + "Q1 = 0.44\n", + "Q4 = 0.60\n", "thr1 = -0.1\n", "thr2 = 0.1\n", - "thr3 = .9\n", + "thr3 = 0.9\n", "thr4 = 1.1\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -195,12 +200,23 @@ " group by year, month, day, day_night, site_id\n", " \"\"\")\n", " # sleep(20)\n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", "df = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -600,7 +616,7 @@ } ], "source": [ - "df.sort_values('curtailment_voltvar_sum', ascending=False)" + "df.sort_values(\"curtailment_voltvar_sum\", ascending=False)" ] }, { @@ -610,7 +626,7 @@ "metadata": {}, "outputs": [], "source": [ - "# 13 timestamps with conformant: P < threshold and 6 uncurtailed > threshold, 7 " + "# 13 timestamps with conformant: P < threshold and 6 uncurtailed > threshold, 7" ] }, { @@ -631,7 +647,7 @@ } ], "source": [ - "71-58" + "71 - 58" ] }, { diff --git a/SolA2024_Analysis/curtailment_voltwattghi.ipynb b/SolA2024_Analysis/curtailment_voltwattghi.ipynb index e46afe4..d1e7583 100644 --- a/SolA2024_Analysis/curtailment_voltwattghi.ipynb +++ b/SolA2024_Analysis/curtailment_voltwattghi.ipynb @@ -7,13 +7,16 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", - "import numpy as np\n", - "import boto3\n", "import random\n", + "\n", + "import boto3\n", + "import numpy as np\n", "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "\n", "session = boto3.Session()\n", - "s3 = boto3.client('s3')" + "s3 = boto3.client(\"s3\")" ] }, { @@ -54,7 +57,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(60) # wait for trino to be ready" ] }, @@ -326,8 +329,10 @@ } ], "source": [ - "v1=253\n", - "v2=260\n", + "v1 = 253\n", + "v2 = 260\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -365,11 +370,21 @@ " order by nonconformance_voltwattghi_sum desc\n", " \"\"\")\n", " # sleep(random.randint(1, 10)) # add some randomness to avoid overwhelming trino with simultaneous queries\n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 13) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 7', 'system.bucket(postcode, 16) > 7'] ]\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 13)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 7\",\n", + " \"system.bucket(postcode, 16) > 7\",\n", + " ]\n", + "]\n", "df = trino_parallel(run_func, tasks, num_workers=num_workers)\n", "df" ] @@ -381,7 +396,7 @@ "metadata": {}, "outputs": [], "source": [ - "# 13 timestamps with conformant: P < threshold and 6 uncurtailed > threshold, 7 " + "# 13 timestamps with conformant: P < threshold and 6 uncurtailed > threshold, 7" ] }, { @@ -402,7 +417,7 @@ } ], "source": [ - "71-58" + "71 - 58" ] }, { diff --git a/SolA2024_Analysis/ghi_pv_estimator.ipynb b/SolA2024_Analysis/ghi_pv_estimator.ipynb index f3e750f..275312b 100644 --- a/SolA2024_Analysis/ghi_pv_estimator.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -51,7 +52,7 @@ "big_workers = 3\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(40)" ] }, @@ -295,11 +296,15 @@ "# num_workers = max(workers, big_workers)\n", "# ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", "sleep(60)\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " # time_filter = f\"year = {year} and month = {month}\"\n", " time_filter = f\"year = {year}\"\n", - " meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and pf_01 >= .98\"\n", + " meta_filters = (\n", + " f\"is_pv=True and {split_cons} and flex_export_detected=False and pf_01 >= .98\"\n", + " )\n", " # meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and site_id in (1669657679,1947677239)\"\n", " df = iceberg_sql(f\"\"\"\n", " with data as \n", @@ -511,14 +516,24 @@ " \n", " \"\"\")\n", " # sleep(20)\n", - " print(f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 16)]]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", - " res = trino_parallel_batch(run_func, tasks, num_workers=num_workers, batch_size=num_workers)\n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [f\"system.bucket(postcode, 16) = {i}\" for i in range(0, 16)]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", + " res = trino_parallel_batch(\n", + " run_func, tasks, num_workers=num_workers, batch_size=num_workers\n", + " )\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", @@ -544,7 +559,7 @@ } ], "source": [ - "res['site_id'].nunique()" + "res[\"site_id\"].nunique()" ] }, { @@ -595,14 +610,26 @@ } ], "source": [ - "res.hist(column='r2', bins=20, edgecolor='k'), \n", - "plt.xlabel('R² Value'), plt.ylabel('Frequency'), plt.title('Distribution of R² Values')\n", - "res.hist(column='mape', bins=20, edgecolor='k')\n", - "plt.xlabel('MAPE Value'), plt.ylabel('Frequency'), plt.title('Distribution of MAPE Values')\n", - "res.hist(column='bias', bins=20, edgecolor='k')\n", - "plt.xlabel('Bias Value'), plt.ylabel('Frequency'), plt.title('Distribution of Bias Values')\n", - "res.hist(column='rmse', bins=20, edgecolor='k')\n", - "plt.xlabel('RMSE Value'), plt.ylabel('Frequency'), plt.title('Distribution of RMSE Values')\n", + "(res.hist(column=\"r2\", bins=20, edgecolor=\"k\"),)\n", + "plt.xlabel(\"R² Value\"), plt.ylabel(\"Frequency\"), plt.title(\"Distribution of R² Values\")\n", + "res.hist(column=\"mape\", bins=20, edgecolor=\"k\")\n", + "(\n", + " plt.xlabel(\"MAPE Value\"),\n", + " plt.ylabel(\"Frequency\"),\n", + " plt.title(\"Distribution of MAPE Values\"),\n", + ")\n", + "res.hist(column=\"bias\", bins=20, edgecolor=\"k\")\n", + "(\n", + " plt.xlabel(\"Bias Value\"),\n", + " plt.ylabel(\"Frequency\"),\n", + " plt.title(\"Distribution of Bias Values\"),\n", + ")\n", + "res.hist(column=\"rmse\", bins=20, edgecolor=\"k\")\n", + "(\n", + " plt.xlabel(\"RMSE Value\"),\n", + " plt.ylabel(\"Frequency\"),\n", + " plt.title(\"Distribution of RMSE Values\"),\n", + ")\n", "plt.show()" ] }, @@ -632,7 +659,7 @@ } ], "source": [ - "res['rmse'].describe()" + "res[\"rmse\"].describe()" ] }, { @@ -665,7 +692,7 @@ } ], "source": [ - "res.query(f'rmse > 15')['site_id']" + "res.query(f\"rmse > 15\")[\"site_id\"]" ] }, { @@ -716,8 +743,10 @@ "# big_workers = 1\n", "# workers = 0\n", "# num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(110)\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " time_filter = f\"year = {year} and month = {month}\"\n", @@ -914,18 +943,30 @@ " \n", " \n", " \"\"\")\n", - " print(f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 16)]]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [f\"system.bucket(postcode, 16) = {i}\" for i in range(0, 16)]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", " df = trino_parallel(run_func, tasks, num_workers=num_workers)\n", - " df['t_stamp'] = pd.to_datetime(df['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", - " df['GHI'] = df['GHI'].fillna(-1)\n", + " df[\"t_stamp\"] = (\n", + " pd.to_datetime(df[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + " )\n", + " df[\"GHI\"] = df[\"GHI\"].fillna(-1)\n", " # df['GHI_cs'] = df['GHI_cs'].fillna(-1)\n", - " df['cloud_type'] = df['cloud_type'].fillna(-1)\n", + " df[\"cloud_type\"] = df[\"cloud_type\"].fillna(-1)\n", " # df['cloud_type_cs'] = df['cloud_type_cs'].fillna(-1)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", @@ -954,7 +995,7 @@ } ], "source": [ - "df['site_id'].unique()" + "df[\"site_id\"].unique()" ] }, { @@ -1181,7 +1222,9 @@ } ], "source": [ - "df.merge(res[['site_id', 'r2', 'mape', 'rmse','n']], on='site_id', how='left').query(f'site_id=={1478393681}')" + "df.merge(res[[\"site_id\", \"r2\", \"mape\", \"rmse\", \"n\"]], on=\"site_id\", how=\"left\").query(\n", + " f\"site_id=={1478393681}\"\n", + ")" ] }, { @@ -1420,7 +1463,9 @@ } ], "source": [ - "df.merge(res[['site_id', 'r2', 'mape', 'rmse','n']], on='site_id', how='left').drop_duplicates(subset=['site_id']).sort_values('rmse', ascending=False)" + "df.merge(\n", + " res[[\"site_id\", \"r2\", \"mape\", \"rmse\", \"n\"]], on=\"site_id\", how=\"left\"\n", + ").drop_duplicates(subset=[\"site_id\"]).sort_values(\"rmse\", ascending=False)" ] }, { @@ -1430,7 +1475,7 @@ "metadata": {}, "outputs": [], "source": [ - "id_counter=-1" + "id_counter = -1" ] }, { @@ -1477,38 +1522,74 @@ "# sample_site_id = sample_site_id[sample_site_id[0] > 0]['site_id'].tolist()\n", "# df0 = df.query(f\"site_id=={sample_site_id[id_counter]}\").reset_index(drop=True)\n", "df0 = df.query(f\"site_id=={sample_site_id}\").reset_index(drop=True)\n", - "t0 = df0['t_stamp'].min()\n", - "t1 = df0['t_stamp'].max()\n", + "t0 = df0[\"t_stamp\"].min()\n", + "t1 = df0[\"t_stamp\"].max()\n", "# t1 = t0 + pd.Timedelta(days=6)\n", - "if t0.time() == pd.Timestamp('00:00:00').time():\n", - " start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", + "if t0.time() == pd.Timestamp(\"00:00:00\").time():\n", + " start_time = t0.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " start_time = (t0 + pd.Timedelta(days=1)).replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", - "if t1.time() == pd.Timestamp('00:00:00').time():\n", - " end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", + " start_time = (\n", + " (t0 + pd.Timedelta(days=1))\n", + " .replace(hour=0, minute=0, second=0)\n", + " .strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", + " )\n", + "if t1.time() == pd.Timestamp(\"00:00:00\").time():\n", + " end_time = t1.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " end_time = t1.replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", + " end_time = t1.replace(hour=0, minute=0, second=0).strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/Test2.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['GHI', 'Cloud Type', 'Active Power (kW)', 'Active Power (kW)']\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/Test2.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\"GHI\", \"Cloud Type\", \"Active Power (kW)\", \"Active Power (kW)\"]\n", "plt_config = {\n", - " 'GHI': [0, 0, '-', None, None],'cloud_type': [0, 1, '-', None, None],\n", - " 'P_kw': [1, 0, '-', None, None],'P_kw_est': [1, 0, '-', None, None],\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + " \"P_kw\": [1, 0, \"-\", None, None],\n", + " \"P_kw_est\": [1, 0, \"-\", None, None],\n", "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df0, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.3], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=200, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'center left','upper left'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=0, onlyntime=0, show=False)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df0,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.3],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=200,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"center left\", \"upper left\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + " onlyntime=0,\n", + " show=False,\n", + ")\n", "a.do()" ] }, @@ -1532,8 +1613,10 @@ } ], "source": [ - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(50)\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons = args\n", " # meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and pf_01 >= .98\"\n", @@ -1697,18 +1780,30 @@ " \n", " \n", " \"\"\")\n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [\"system.bucket(postcode, 16) > -1\"]\n", + "]\n", + "\n", + "try:\n", " df = trino_parallel(run_func, tasks, num_workers=num_workers)\n", - " df['t_stamp'] = pd.to_datetime(df['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", - " df['GHI'] = df['GHI'].fillna(-1)\n", - " df['GHI_cs'] = df['GHI_cs'].fillna(-1)\n", - " df['cloud_type'] = df['cloud_type'].fillna(-1)\n", - " df['cloud_type_cs'] = df['cloud_type_cs'].fillna(-1)\n", + " df[\"t_stamp\"] = (\n", + " pd.to_datetime(df[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + " )\n", + " df[\"GHI\"] = df[\"GHI\"].fillna(-1)\n", + " df[\"GHI_cs\"] = df[\"GHI_cs\"].fillna(-1)\n", + " df[\"cloud_type\"] = df[\"cloud_type\"].fillna(-1)\n", + " df[\"cloud_type_cs\"] = df[\"cloud_type_cs\"].fillna(-1)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", @@ -1746,7 +1841,7 @@ } ], "source": [ - "df['site_id'].unique()" + "df[\"site_id\"].unique()" ] }, { @@ -1881,7 +1976,7 @@ } ], "source": [ - "df[df['actual_tod']=='0 days 06:00:00']" + "df[df[\"actual_tod\"] == \"0 days 06:00:00\"]" ] }, { @@ -1902,7 +1997,7 @@ } ], "source": [ - "df['site_id'].unique()" + "df[\"site_id\"].unique()" ] }, { @@ -1963,7 +2058,7 @@ } ], "source": [ - "df['actual_tod'][100:140]" + "df[\"actual_tod\"][100:140]" ] }, { @@ -2020,41 +2115,95 @@ } ], "source": [ - "id_counter+=1\n", - "sample_site_id = df.groupby('site_id').size().reset_index()\n", - "sample_site_id = sample_site_id[sample_site_id[0] > 0]['site_id'].tolist()\n", + "id_counter += 1\n", + "sample_site_id = df.groupby(\"site_id\").size().reset_index()\n", + "sample_site_id = sample_site_id[sample_site_id[0] > 0][\"site_id\"].tolist()\n", "df0 = df.query(f\"site_id=={sample_site_id[id_counter]}\").reset_index(drop=True)\n", - "t0 = df0['t_stamp'].min()\n", - "t1 = df0['t_stamp'].max()\n", - "if t0.time() == pd.Timestamp('00:00:00').time():\n", - " start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", + "t0 = df0[\"t_stamp\"].min()\n", + "t1 = df0[\"t_stamp\"].max()\n", + "if t0.time() == pd.Timestamp(\"00:00:00\").time():\n", + " start_time = t0.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " start_time = (t0 + pd.Timedelta(days=1)).replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", - "if t1.time() == pd.Timestamp('00:00:00').time():\n", - " end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", + " start_time = (\n", + " (t0 + pd.Timedelta(days=1))\n", + " .replace(hour=0, minute=0, second=0)\n", + " .strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", + " )\n", + "if t1.time() == pd.Timestamp(\"00:00:00\").time():\n", + " end_time = t1.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " end_time = t1.replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", + " end_time = t1.replace(hour=0, minute=0, second=0).strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/Test.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['GHI', 'Cloud Type', 'Active Power (kW)', 'Active Power (kW)','GHI', 'Cloud Type', 'Active Power (kW)', 'Active Power (kW)']\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/Test.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"GHI\",\n", + " \"Cloud Type\",\n", + " \"Active Power (kW)\",\n", + " \"Active Power (kW)\",\n", + " \"GHI\",\n", + " \"Cloud Type\",\n", + " \"Active Power (kW)\",\n", + " \"Active Power (kW)\",\n", + "]\n", "plt_config = {\n", - " 'GHI': [0, 0, '-', None, None], 'cloud_type': [0, 1, '-', None, None],'P_kw_norm': [1, 0, '-', None, None],\n", - " 'GHI_cs': [2, 0, '-', None, None], 'cloud_type_cs': [2, 1, '-', None, None],'P_kw_norm_cs': [3, 0, '-', None, None]\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + " \"P_kw_norm\": [1, 0, \"-\", None, None],\n", + " \"GHI_cs\": [2, 0, \"-\", None, None],\n", + " \"cloud_type_cs\": [2, 1, \"-\", None, None],\n", + " \"P_kw_norm_cs\": [3, 0, \"-\", None, None],\n", "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df0, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.3], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=200, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'center left','upper left', 'upper right', 'center left'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=0, onlyntime=0, show=False)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df0,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.3],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=200,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\n", + " \"upper left\",\n", + " \"upper right\",\n", + " \"center left\",\n", + " \"upper left\",\n", + " \"upper right\",\n", + " \"center left\",\n", + " ],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + " onlyntime=0,\n", + " show=False,\n", + ")\n", "a.do()" ] } diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/Write_All_uncartailedPV.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/Write_All_uncartailedPV.ipynb index 8b47c63..cd3e7a9 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/Write_All_uncartailedPV.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/Write_All_uncartailedPV.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -38,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "sleep(120)\n" + "sleep(120)" ] }, { @@ -71,7 +72,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(60)" ] }, @@ -806,10 +807,14 @@ } ], "source": [ - "num_parts=3\n", - "time_bin_interval = '5' # in minutes\n", + "num_parts = 3\n", + "time_bin_interval = \"5\" # in minutes\n", "model = \"pv_ghi_norm_model\"\n", - "acceptible_sites = ', '.join(map(str, pd.read_csv('mape<50_sites.csv')['site_id'].tolist()))\n", + "acceptible_sites = \", \".join(\n", + " map(str, pd.read_csv(\"mape<50_sites.csv\")[\"site_id\"].tolist())\n", + ")\n", + "\n", + "\n", "def run_func(args):\n", " year, month, part = args\n", " # time_filter = f\"year = {year} and month = {month}\"\n", @@ -857,17 +862,25 @@ " where P_kw_norm_est is not null\n", " \"\"\")\n", "\n", - " # \n", + " #\n", "\n", " # sleep(10)\n", " print(f\"Completed {time_filter}, part {part}\")\n", " return df\n", - "tasks = [(year, month, part) for year in (2024, ) for month in range(1, 2) \n", - " for part in range(0, num_parts)]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", - " df = trino_parallel_batch(run_func, tasks, num_workers=num_workers, batch_size=num_workers)\n", + "\n", + "\n", + "tasks = [\n", + " (year, month, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for part in range(0, num_parts)\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", + " df = trino_parallel_batch(\n", + " run_func, tasks, num_workers=num_workers, batch_size=num_workers\n", + " )\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/Write_structured_data.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/Write_structured_data.ipynb index df17dec..8d9edb2 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/Write_structured_data.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/Write_structured_data.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -38,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "sleep(120)\n" + "sleep(120)" ] }, { @@ -71,7 +72,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(60)" ] }, @@ -152,14 +153,18 @@ } ], "source": [ - "num_parts=2\n", - "time_bin_interval = '5' # in minutes\n", + "num_parts = 2\n", + "time_bin_interval = \"5\" # in minutes\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons, part = args\n", " # time_filter = f\"year = {year} and month = {month}\"\n", " time_filter = f\"year = {year}\"\n", " part_filter = f\"site_id % {num_parts} = {part}\"\n", - " meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " meta_filters = (\n", + " f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " )\n", " df = iceberg_exec(f\"\"\"\n", " insert into structured_data\n", " with data as \n", @@ -336,18 +341,28 @@ " select * from strcutured_data\n", " \"\"\")\n", "\n", - " # \n", + " #\n", "\n", " # sleep(10)\n", - " print(f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}, part {part}\")\n", + " print(\n", + " f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}, part {part}\"\n", + " )\n", " return df\n", - "tasks = [(year, month, split_cons, part) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(14, 16)]\n", - " for part in range(0, num_parts)]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", - " df = trino_parallel_batch(run_func, tasks, num_workers=num_workers, batch_size=num_workers)\n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [f\"system.bucket(postcode, 16) = {i}\" for i in range(14, 16)]\n", + " for part in range(0, num_parts)\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", + " df = trino_parallel_batch(\n", + " run_func, tasks, num_workers=num_workers, batch_size=num_workers\n", + " )\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/model_ghi_norm.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/model_ghi_norm.ipynb index 6160586..b93e958 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/model_ghi_norm.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/model_ghi_norm.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -59,7 +60,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "# sleep(60)" ] }, @@ -142,7 +143,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day. " + "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day." ] }, { @@ -169,9 +170,7 @@ " a double,\n", " b double,\n", " n BIGINT\n", - " )\"\"\"\n", - " )\n", - " " + " )\"\"\")" ] }, { @@ -210,8 +209,10 @@ } ], "source": [ - "num_parts=1\n", - "time_bin_interval = '5' # in minutes\n", + "num_parts = 1\n", + "time_bin_interval = \"5\" # in minutes\n", + "\n", + "\n", "def run_func(args):\n", " year, month, part = args\n", " time_filter = f\"year = {year}\"\n", @@ -251,28 +252,34 @@ " select * from model\n", " \n", " \"\"\")\n", - "# this is the current model:\n", - "# P_norm/P_norm_cs = a + b * GHI/GHI_cs --> P_norm = P_norm_cs * (a + b * GHI/GHI_cs)\n", - "# 1 = a + b * 1 : this applies on clear sky days\n", - "# < 1? = a + b * (< 1?)\n", + " # this is the current model:\n", + " # P_norm/P_norm_cs = a + b * GHI/GHI_cs --> P_norm = P_norm_cs * (a + b * GHI/GHI_cs)\n", + " # 1 = a + b * 1 : this applies on clear sky days\n", + " # < 1? = a + b * (< 1?)\n", "\n", - "# VoltVar curtailment condition:\n", - "# S_norm >= 1 and P_norm < 1 theoretically\n", + " # VoltVar curtailment condition:\n", + " # S_norm >= 1 and P_norm < 1 theoretically\n", "\n", - "# VoltWatt curtailment condition:\n", - "# V> 253 and P_norm < 1 theoretically\n", + " # VoltWatt curtailment condition:\n", + " # V> 253 and P_norm < 1 theoretically\n", "\n", - "# Alternative models that can be tried:\n", - "# P = a + b * GHI but on a tod on a season\n", - "# P_cs = a + b * GHI_cs\n", - "# P_norm = a + b * GHI + c * GHI_cs\n", - "# \n", + " # Alternative models that can be tried:\n", + " # P = a + b * GHI but on a tod on a season\n", + " # P_cs = a + b * GHI_cs\n", + " # P_norm = a + b * GHI + c * GHI_cs\n", + " #\n", " print(f\"Completed {time_filter}, part {part}\")\n", " return df\n", - "tasks = [(year, month, part) for year in (2024, ) for month in range(1, 2) \n", - " for part in range(0, num_parts)]\n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for part in range(0, num_parts)\n", + "]\n", + "\n", + "try:\n", " res = trino_parallel(run_func, tasks, num_workers=num_workers)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", @@ -300,7 +307,7 @@ } ], "source": [ - "365-52*2" + "365 - 52 * 2" ] }, { diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/single_site_model.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/single_site_model.ipynb index 99b1822..94d0ad9 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/single_site_model.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/single_site_model.ipynb @@ -7,11 +7,13 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", + "import time\n", + "\n", "import numpy as np\n", - "from visualisation import *\n", "import pytz\n", - "import time\n" + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -60,7 +62,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "# sleep(40)" ] }, @@ -143,7 +145,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day. " + "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day." ] }, { @@ -170,9 +172,7 @@ " a double,\n", " b double,\n", " n BIGINT\n", - " )\"\"\"\n", - " )\n", - " " + " )\"\"\")" ] }, { @@ -209,8 +209,10 @@ } ], "source": [ - "num_parts=1\n", - "time_bin_interval = '5' # in minutes\n", + "num_parts = 1\n", + "time_bin_interval = \"5\" # in minutes\n", + "\n", + "\n", "def run_func(args):\n", " t00 = time.time()\n", " year, month, part = args\n", @@ -256,14 +258,21 @@ " \n", " \"\"\")\n", "\n", - " print(f\"Completed {time_filter}, part {part}, time taken: {round(time.time() - t00, 2)} seconds\")\n", + " print(\n", + " f\"Completed {time_filter}, part {part}, time taken: {round(time.time() - t00, 2)} seconds\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, part) for year in (2024, ) for month in range(1, 2) \n", - " for part in range(0, num_parts)]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", + "\n", + "tasks = [\n", + " (year, month, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for part in range(0, num_parts)\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", " res = trino_parallel(run_func, tasks, num_workers=num_workers)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", @@ -291,7 +300,7 @@ } ], "source": [ - "365-52*2" + "365 - 52 * 2" ] }, { @@ -708,8 +717,10 @@ } ], "source": [ - "num_parts=1\n", - "time_bin_interval = '5' # in minutes\n", + "num_parts = 1\n", + "time_bin_interval = \"5\" # in minutes\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons, part = args\n", " time_filter = f\"year = {year}\"\n", @@ -903,14 +914,22 @@ " \n", " \"\"\")\n", "\n", - " print(f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}, part {part}\")\n", + " print(\n", + " f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}, part {part}\"\n", + " )\n", " return df\n", - "tasks = [(year, month, split_cons, part) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(16)]\n", - " for part in range(0, num_parts)]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [f\"system.bucket(postcode, 16) = {i}\" for i in range(16)]\n", + " for part in range(0, num_parts)\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", " res = trino_parallel(run_func, tasks, num_workers=num_workers)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/split_days.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/split_days.ipynb index d616e20..f80b2f6 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/split_days.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/split_days.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -60,7 +61,7 @@ } ], "source": [ - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(60)" ] }, @@ -143,7 +144,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day. " + "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day." ] }, { @@ -168,9 +169,7 @@ " site_id BIGINT,\n", " actual_day DATE,\n", " day_type varchar\n", - " )\"\"\"\n", - " )\n", - " " + " )\"\"\")" ] }, { @@ -203,14 +202,18 @@ } ], "source": [ - "num_parts=1\n", - "time_bin_interval = '30' # in minutes\n", + "num_parts = 1\n", + "time_bin_interval = \"30\" # in minutes\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons, part = args\n", " # time_filter = f\"year = {year} and month = {month}\"\n", " time_filter = f\"year = {year}\"\n", " part_filter = f\"site_id % {num_parts} = {part}\"\n", - " meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " meta_filters = (\n", + " f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " )\n", " # meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and site_id in (1669657679,1947677239)\"\n", " df = iceberg_exec(f\"\"\"\n", " insert into split_days\n", @@ -413,12 +416,18 @@ " # print(f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}, part {part}, count: {df['site_id'].nunique()}\")\n", " # sleep(3)\n", " return df\n", - "tasks = [(year, month, split_cons, part) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 16)]\n", - " for part in range(0, num_parts)]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [f\"system.bucket(postcode, 16) = {i}\" for i in range(0, 16)]\n", + " for part in range(0, num_parts)\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", " res = trino_parallel(run_func, tasks, num_workers=num_workers)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/val_on_single_sites.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/val_on_single_sites.ipynb index aa67a09..c9a69f2 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/val_on_single_sites.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/val_on_single_sites.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -61,7 +62,7 @@ "big_workers = 0\n", "workers = 1\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "# sleep(40)" ] }, @@ -308,9 +309,10 @@ } ], "source": [ + "num_parts = 1\n", + "time_bin_interval = \"5\" # in minutes\n", + "\n", "\n", - "num_parts=1\n", - "time_bin_interval = '5' # in minutes\n", "def run_func(args):\n", " year, month, part = args\n", " # time_filter = f\"year = {year} and month = {month}\"\n", @@ -359,24 +361,34 @@ " FROM validation_on_test_data\n", " \"\"\")\n", "\n", - " # \n", + " #\n", "\n", " # sleep(20)\n", " # print(f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}, part {part}, count: {df['site_id'].nunique()}\")\n", " return df\n", - "tasks = [(year, month, part) for year in (2024, ) for month in range(1, 2) \n", - " for part in range(0, num_parts)]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for part in range(0, num_parts)\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", " res_test = trino_parallel(run_func, tasks, num_workers=1)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", " # stop_trino()\n", " pass\n", - "res_test['t_stamp'] = pd.to_datetime(res_test['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", - "res_test['GHI_norm'] = res_test['GHI_norm'].fillna(-1)\n", + "res_test[\"t_stamp\"] = (\n", + " pd.to_datetime(res_test[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + ")\n", + "res_test[\"GHI_norm\"] = res_test[\"GHI_norm\"].fillna(-1)\n", "res_test" ] }, @@ -460,41 +472,84 @@ "# sample_site_id = 1792599725\n", "# sample_site_id = 696192939\n", "df0 = res_test.query(f\"site_id=={sample_site_id}\").reset_index(drop=True)\n", - "t0 = df0['t_stamp'].min()\n", - "t1 = df0['t_stamp'].max()\n", + "t0 = df0[\"t_stamp\"].min()\n", + "t1 = df0[\"t_stamp\"].max()\n", "t0 = t0 + pd.Timedelta(days=38)\n", "t1 = t0 + pd.Timedelta(days=18)\n", "# start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", "# end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", - "if t0.time() == pd.Timestamp('00:00:00').time():\n", - " start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", + "if t0.time() == pd.Timestamp(\"00:00:00\").time():\n", + " start_time = t0.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " start_time = (t0 + pd.Timedelta(days=1)).replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", - "if t1.time() == pd.Timestamp('00:00:00').time():\n", - " end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", + " start_time = (\n", + " (t0 + pd.Timedelta(days=1))\n", + " .replace(hour=0, minute=0, second=0)\n", + " .strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", + " )\n", + "if t1.time() == pd.Timestamp(\"00:00:00\").time():\n", + " end_time = t1.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " end_time = t1.replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", + " end_time = t1.replace(hour=0, minute=0, second=0).strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = f'Figures/GHInorm_{sample_site_id}.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['GHI_norm', 'Voltage (V)', 'Active Power (kW)', 'Active Power (kW)', 'Active Power (kW)']\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = f\"Figures/GHInorm_{sample_site_id}.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"GHI_norm\",\n", + " \"Voltage (V)\",\n", + " \"Active Power (kW)\",\n", + " \"Active Power (kW)\",\n", + " \"Active Power (kW)\",\n", + "]\n", "plt_config = {\n", - " 'GHI_norm': [0, 0, '-', None, None], 'V': [0, 1, '-', None, None],\n", - " 'P_kw_norm': [1, 0, '-', None, None],'P_kw_norm_est': [1, 0, '-', None, None],'P_kw_norm_cs': [1, 0, '-', None, None]\n", + " \"GHI_norm\": [0, 0, \"-\", None, None],\n", + " \"V\": [0, 1, \"-\", None, None],\n", + " \"P_kw_norm\": [1, 0, \"-\", None, None],\n", + " \"P_kw_norm_est\": [1, 0, \"-\", None, None],\n", + " \"P_kw_norm_cs\": [1, 0, \"-\", None, None],\n", "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df0, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.3], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=200, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'center left','upper left', 'lower left'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=0, onlyntime=0, show=False)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df0,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.3],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=200,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"center left\", \"upper left\", \"lower left\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + " onlyntime=0,\n", + " show=False,\n", + ")\n", "a.do()" ] }, diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/val_on_test.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/val_on_test.ipynb index eb22e22..d666095 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/val_on_test.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/val_on_test.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -51,7 +52,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(40)" ] }, @@ -134,7 +135,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day. " + "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day." ] }, { @@ -337,14 +338,18 @@ "# num_workers = max(workers, big_workers)\n", "# ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", "# sleep(20)\n", - "num_parts=7\n", + "num_parts = 7\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons, part = args\n", " # time_filter = f\"year = {year} and month = {month}\"\n", " time_filter = f\"year = {year}\"\n", " # part_filter = f\"postcode % {num_parts} = {part}\"\n", " part_filter = f\"site_id % {num_parts} = {part}\"\n", - " meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " meta_filters = (\n", + " f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " )\n", " # meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and site_id in (1669657679,1947677239)\"\n", " df = iceberg_sql(f\"\"\"\n", " with data as \n", @@ -558,24 +563,34 @@ " FROM validation_on_test_data\n", " \"\"\")\n", "\n", - " # \n", + " #\n", "\n", " # sleep(20)\n", " # print(f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}, part {part}, count: {df['site_id'].nunique()}\")\n", " return df\n", - "tasks = [(year, month, split_cons, part) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 1)]\n", - " for part in range(0, 1)]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [f\"system.bucket(postcode, 16) = {i}\" for i in range(0, 1)]\n", + " for part in range(0, 1)\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", " res_test = trino_parallel(run_func, tasks, num_workers=1)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", " # stop_trino()\n", " pass\n", - "res_test['t_stamp'] = pd.to_datetime(res_test['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", + "res_test[\"t_stamp\"] = (\n", + " pd.to_datetime(res_test[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + ")\n", "res_test" ] }, @@ -602,7 +617,7 @@ } ], "source": [ - "res_test['site_id'].unique()" + "res_test[\"site_id\"].unique()" ] }, { @@ -906,11 +921,11 @@ "source": [ "sample_site_id = 1311557261\n", "df0 = res_test.query(f\"site_id=={sample_site_id}\").reset_index(drop=True)\n", - "t0 = df0['t_stamp'].min()\n", - "t1 = df0['t_stamp'].max()\n", + "t0 = df0[\"t_stamp\"].min()\n", + "t1 = df0[\"t_stamp\"].max()\n", "# t1 = t0 + pd.Timedelta(days=6)\n", - "start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", - "end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", + "start_time = t0.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", + "end_time = t1.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "# if t0.time() == pd.Timestamp('00:00:00').time():\n", "# start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", "# else:\n", @@ -920,26 +935,57 @@ "# else:\n", "# end_time = t1.replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/test_mape25.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['GHI', 'Active Power (kW)', 'Active Power (kW)']\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/test_mape25.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\"GHI\", \"Active Power (kW)\", \"Active Power (kW)\"]\n", "plt_config = {\n", - " 'GHI': [0, 0, '-', None, None],\n", - " 'P_kw_norm': [1, 0, '-', None, None],'P_kw_norm_est': [1, 0, '-', None, None],\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"P_kw_norm\": [1, 0, \"-\", None, None],\n", + " \"P_kw_norm_est\": [1, 0, \"-\", None, None],\n", "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df0, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.3], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=200, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'center left','upper left'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=0, onlyntime=0, show=False)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df0,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.3],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=200,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"center left\", \"upper left\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + " onlyntime=0,\n", + " show=False,\n", + ")\n", "a.do()" ] } diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/val_on_train.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/val_on_train.ipynb index 308a0ad..25236d0 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/val_on_train.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/val_on_train.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -51,7 +52,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(40)" ] }, @@ -134,7 +135,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day. " + "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day." ] }, { @@ -313,14 +314,18 @@ "# num_workers = max(workers, big_workers)\n", "# ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", "# sleep(20)\n", - "num_parts=7\n", + "num_parts = 7\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons, part = args\n", " # time_filter = f\"year = {year} and month = {month}\"\n", " time_filter = f\"year = {year}\"\n", " # part_filter = f\"postcode % {num_parts} = {part}\"\n", " part_filter = f\"site_id % {num_parts} = {part}\"\n", - " meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " meta_filters = (\n", + " f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " )\n", " # meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and site_id in (699345787)\"\n", " df = iceberg_sql(f\"\"\"\n", " with data as \n", @@ -516,24 +521,34 @@ " FROM validation_on_train_data\n", " \"\"\")\n", "\n", - " # \n", + " #\n", "\n", " # sleep(20)\n", " # print(f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}, part {part}, count: {df['site_id'].nunique()}\")\n", " return df\n", - "tasks = [(year, month, split_cons, part) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 1)]\n", - " for part in range(0, 1)]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [f\"system.bucket(postcode, 16) = {i}\" for i in range(0, 1)]\n", + " for part in range(0, 1)\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", " res_train = trino_parallel(run_func, tasks, num_workers=1)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", " # stop_trino()\n", " pass\n", - "res_train['t_stamp'] = pd.to_datetime(res_train['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", + "res_train[\"t_stamp\"] = (\n", + " pd.to_datetime(res_train[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + ")\n", "res_train" ] }, @@ -631,40 +646,75 @@ "source": [ "sample_site_id = 905749026\n", "df0 = res_train.query(f\"site_id=={sample_site_id}\").reset_index(drop=True)\n", - "t0 = df0['t_stamp'].min()\n", - "t1 = df0['t_stamp'].max()\n", + "t0 = df0[\"t_stamp\"].min()\n", + "t1 = df0[\"t_stamp\"].max()\n", "# t1 = t0 + pd.Timedelta(days=6)\n", "# start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", "# end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", - "if t0.time() == pd.Timestamp('00:00:00').time():\n", - " start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", + "if t0.time() == pd.Timestamp(\"00:00:00\").time():\n", + " start_time = t0.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " start_time = (t0 + pd.Timedelta(days=1)).replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", - "if t1.time() == pd.Timestamp('00:00:00').time():\n", - " end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", + " start_time = (\n", + " (t0 + pd.Timedelta(days=1))\n", + " .replace(hour=0, minute=0, second=0)\n", + " .strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", + " )\n", + "if t1.time() == pd.Timestamp(\"00:00:00\").time():\n", + " end_time = t1.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " end_time = t1.replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", + " end_time = t1.replace(hour=0, minute=0, second=0).strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/train_mape25.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['GHI', 'Active Power (kW)', 'Active Power (kW)']\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/train_mape25.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\"GHI\", \"Active Power (kW)\", \"Active Power (kW)\"]\n", "plt_config = {\n", - " 'GHI': [0, 0, '-', None, None],\n", - " 'P_kw_norm': [1, 0, '-', None, None],'P_kw_norm_est': [1, 0, '-', None, None],\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"P_kw_norm\": [1, 0, \"-\", None, None],\n", + " \"P_kw_norm_est\": [1, 0, \"-\", None, None],\n", "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df0, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.3], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=200, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'center left','upper left'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=0, onlyntime=0, show=False)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df0,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.3],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=200,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"center left\", \"upper left\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + " onlyntime=0,\n", + " show=False,\n", + ")\n", "a.do()" ] } diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/val_on_val.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/val_on_val.ipynb index a0ba058..98604be 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/val_on_val.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/val_on_val.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -49,7 +50,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "sleep(40)" ] }, @@ -132,7 +133,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day. " + "# Some sites are excluded because no CS day profile is detected for them. For example, site 1525233041, cs_day is 2024-01-21. But no ts data is detected for that day." ] }, { @@ -298,14 +299,18 @@ "# num_workers = max(workers, big_workers)\n", "# ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", "# sleep(20)\n", - "num_parts=7\n", + "num_parts = 7\n", + "\n", + "\n", "def run_func(args):\n", " year, month, split_cons, part = args\n", " # time_filter = f\"year = {year} and month = {month}\"\n", " time_filter = f\"year = {year}\"\n", " # part_filter = f\"postcode % {num_parts} = {part}\"\n", " part_filter = f\"site_id % {num_parts} = {part}\"\n", - " meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " meta_filters = (\n", + " f\"is_pv=True and {split_cons} and flex_export_detected=False and {part_filter}\"\n", + " )\n", " # meta_filters = f\"is_pv=True and {split_cons} and flex_export_detected=False and site_id in (1669657679,1947677239)\"\n", " df = iceberg_sql(f\"\"\"\n", " with data as \n", @@ -501,24 +506,34 @@ " FROM validation_on_val_data\n", " \"\"\")\n", "\n", - " # \n", + " #\n", "\n", " # sleep(20)\n", " # print(f\"Completed {time_filter}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}, part {part}, count: {df['site_id'].nunique()}\")\n", " return df\n", - "tasks = [(year, month, split_cons, part) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in [f'system.bucket(postcode, 16) = {i}' for i in range(0, 1)]\n", - " for part in range(0, 1)]\n", - " # for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", - " \n", - "try: \n", + "\n", + "\n", + "tasks = [\n", + " (year, month, split_cons, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [f\"system.bucket(postcode, 16) = {i}\" for i in range(0, 1)]\n", + " for part in range(0, 1)\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) > -1'] ]\n", + "\n", + "try:\n", " res_val = trino_parallel(run_func, tasks, num_workers=1)\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", " # stop_trino()\n", " pass\n", - "res_val['t_stamp'] = pd.to_datetime(res_val['t_stamp']).dt.tz_localize('utc').dt.tz_convert(pytz.FixedOffset(10*60))\n", + "res_val[\"t_stamp\"] = (\n", + " pd.to_datetime(res_val[\"t_stamp\"])\n", + " .dt.tz_localize(\"utc\")\n", + " .dt.tz_convert(pytz.FixedOffset(10 * 60))\n", + ")\n", "res_val" ] }, @@ -529,7 +544,7 @@ "metadata": {}, "outputs": [], "source": [ - "id_counter=-1" + "id_counter = -1" ] }, { @@ -579,38 +594,73 @@ "source": [ "sample_site_id = 905749026\n", "df0 = res_val.query(f\"site_id=={sample_site_id}\").reset_index(drop=True)\n", - "t0 = df0['t_stamp'].min()\n", - "t1 = df0['t_stamp'].max()\n", + "t0 = df0[\"t_stamp\"].min()\n", + "t1 = df0[\"t_stamp\"].max()\n", "# t1 = t0 + pd.Timedelta(days=6)\n", - "if t0.time() == pd.Timestamp('00:00:00').time():\n", - " start_time = t0.strftime('%Y-%m-%d %H:%M:%S%z')\n", + "if t0.time() == pd.Timestamp(\"00:00:00\").time():\n", + " start_time = t0.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " start_time = (t0 + pd.Timedelta(days=1)).replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", - "if t1.time() == pd.Timestamp('00:00:00').time():\n", - " end_time = t1.strftime('%Y-%m-%d %H:%M:%S%z')\n", + " start_time = (\n", + " (t0 + pd.Timedelta(days=1))\n", + " .replace(hour=0, minute=0, second=0)\n", + " .strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", + " )\n", + "if t1.time() == pd.Timestamp(\"00:00:00\").time():\n", + " end_time = t1.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "else:\n", - " end_time = t1.replace(hour=0, minute=0, second=0).strftime('%Y-%m-%d %H:%M:%S%z')\n", + " end_time = t1.replace(hour=0, minute=0, second=0).strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/val_mape25.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['GHI', 'Active Power (kW)', 'Active Power (kW)']\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/val_mape25.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\"GHI\", \"Active Power (kW)\", \"Active Power (kW)\"]\n", "plt_config = {\n", - " 'GHI': [0, 0, '-', None, None],\n", - " 'P_kw_norm': [1, 0, '-', None, None],'P_kw_norm_est': [1, 0, '-', None, None],\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"P_kw_norm\": [1, 0, \"-\", None, None],\n", + " \"P_kw_norm_est\": [1, 0, \"-\", None, None],\n", "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df0, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='site_id', time_attr='t_stamp', color_nights=color_nights,cmap='plasma',\n", - " figsize=[16/2.54,1.3], same_scale=1, fontsize=5, fontname='DejaVu Sans', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=200, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'center left','upper left'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', \n", - "legend_i=0, title_i=0, only1title=0, onlyntime=0, show=False)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df0,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"site_id\",\n", + " time_attr=\"t_stamp\",\n", + " color_nights=color_nights,\n", + " cmap=\"plasma\",\n", + " figsize=[16 / 2.54, 1.3],\n", + " same_scale=1,\n", + " fontsize=5,\n", + " fontname=\"DejaVu Sans\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=200,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"center left\", \"upper left\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + " onlyntime=0,\n", + " show=False,\n", + ")\n", "a.do()" ] } diff --git a/SolA2024_Analysis/ghi_pv_estimator_general/voltwatt_uncartailedPV.ipynb b/SolA2024_Analysis/ghi_pv_estimator_general/voltwatt_uncartailedPV.ipynb index 22d0bfe..71131db 100644 --- a/SolA2024_Analysis/ghi_pv_estimator_general/voltwatt_uncartailedPV.ipynb +++ b/SolA2024_Analysis/ghi_pv_estimator_general/voltwatt_uncartailedPV.ipynb @@ -7,10 +7,11 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import numpy as np\n", - "from visualisation import *\n", - "import pytz" + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -38,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "sleep(120)\n" + "sleep(120)" ] }, { @@ -59,7 +60,7 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", "# sleep(90)" ] }, @@ -117,10 +118,14 @@ } ], "source": [ - "num_parts=2\n", - "time_bin_interval = '5'\n", + "num_parts = 2\n", + "time_bin_interval = \"5\"\n", "model = \"pv_ghi_norm_model\"\n", - "acceptible_sites = ', '.join(map(str, pd.read_csv('mape<50_sites.csv')['site_id'].tolist()))\n", + "acceptible_sites = \", \".join(\n", + " map(str, pd.read_csv(\"mape<50_sites.csv\")[\"site_id\"].tolist())\n", + ")\n", + "\n", + "\n", "def run_func(args):\n", " year, month, part = args\n", " # time_filter = f\"year = {year} and month = {month}\"\n", @@ -167,16 +172,24 @@ " where P_kw_norm_est is not null\n", " \"\"\")\n", "\n", - " # \n", + " #\n", "\n", " sleep(10)\n", " print(f\"Completed {time_filter}, part {part}\")\n", " return df\n", - "tasks = [(year, month, part) for year in (2024, ) for month in range(1, 2) \n", - " for part in range(0, num_parts)]\n", - " \n", - "try: \n", - " df = trino_parallel_batch(run_func, tasks, num_workers=num_workers, batch_size=num_workers)\n", + "\n", + "\n", + "tasks = [\n", + " (year, month, part)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for part in range(0, num_parts)\n", + "]\n", + "\n", + "try:\n", + " df = trino_parallel_batch(\n", + " run_func, tasks, num_workers=num_workers, batch_size=num_workers\n", + " )\n", "except Exception as e:\n", " print(f\"Error during data retrieval: {e}\")\n", "finally:\n", diff --git a/SolA2024_Analysis/sustained_operation.ipynb b/SolA2024_Analysis/sustained_operation.ipynb index 5b9839d..575613a 100644 --- a/SolA2024_Analysis/sustained_operation.ipynb +++ b/SolA2024_Analysis/sustained_operation.ipynb @@ -7,12 +7,14 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import json\n", - "import numpy as np\n", + "\n", "import matplotlib.pyplot as plt\n", - "from visualisation import *\n", - "import pytz" + "import numpy as np\n", + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -24,6 +26,7 @@ "source": [ "sleep(60)\n", "import subprocess as sp\n", + "\n", "sp.run(\"shutdown -h now\", shell=True)" ] }, @@ -42,7 +45,7 @@ } ], "source": [ - "stop_trino()\n" + "stop_trino()" ] }, { @@ -65,8 +68,8 @@ "big_workers = 1\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", - "sleep(30)\n" + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", + "sleep(30)" ] }, { @@ -76,7 +79,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = iceberg_sql('select * from conformance_sust_op')" + "df = iceberg_sql(\"select * from conformance_sust_op\")" ] }, { @@ -291,7 +294,9 @@ } ], "source": [ - "df.query('year ==2024 and month==1 and total_count > 5 and nonconformance_sust_op_count == 0')" + "df.query(\n", + " \"year ==2024 and month==1 and total_count > 5 and nonconformance_sust_op_count == 0\"\n", + ")" ] }, { @@ -1152,16 +1157,28 @@ " where day_night = 'day'\n", " group by site_id, day, day_night\n", " \"\"\")\n", - " # \n", - " \n", - " print(f\"Completed year={year}, month={month}, v_threshold={v_threshold}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " #\n", + "\n", + " print(\n", + " f\"Completed year={year}, month={month}, v_threshold={v_threshold}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return None\n", "\n", - "tasks = [(year, month, v_threshold, split_cons) for year in (2024, 2025) for month in range(1, 13) for v_threshold in range(253, 259) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", - " \n", + "\n", + "tasks = [\n", + " (year, month, v_threshold, split_cons)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for v_threshold in range(253, 259)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", + "\n", "df2 = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -1184,6 +1201,8 @@ ], "source": [ "v_threshold = 258\n", + "\n", + "\n", "def run_func(args):\n", " year, month, v_threshold, split_cons = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -1225,16 +1244,27 @@ " where day_night = 'day'\n", " group by site_id, day, day_night\n", " \"\"\")\n", - " # \n", - " \n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " #\n", + "\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", - " \n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", + "\n", "df2 = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -1686,7 +1716,7 @@ } ], "source": [ - "df['site_id'].nunique()" + "df[\"site_id\"].nunique()" ] }, { diff --git a/SolA2024_Analysis/sustained_operation_3w.ipynb b/SolA2024_Analysis/sustained_operation_3w.ipynb index b172942..4f7cb85 100644 --- a/SolA2024_Analysis/sustained_operation_3w.ipynb +++ b/SolA2024_Analysis/sustained_operation_3w.ipynb @@ -7,12 +7,14 @@ "metadata": {}, "outputs": [], "source": [ - "from Data_query.trino_config import *\n", "import json\n", - "import numpy as np\n", + "\n", "import matplotlib.pyplot as plt\n", - "from visualisation import *\n", - "import pytz" + "import numpy as np\n", + "import pytz\n", + "\n", + "from Data_query.trino_config import *\n", + "from visualisation import *" ] }, { @@ -24,6 +26,7 @@ "source": [ "sleep(60)\n", "import subprocess as sp\n", + "\n", "sp.run(\"shutdown -h now\", shell=True)" ] }, @@ -42,7 +45,7 @@ } ], "source": [ - "stop_trino()\n" + "stop_trino()" ] }, { @@ -52,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "sleep(120)\n" + "sleep(120)" ] }, { @@ -75,8 +78,8 @@ "big_workers = 6\n", "workers = 0\n", "num_workers = max(workers, big_workers)\n", - "ensure_trino_running(worker_desired_count = workers, big_worker_desired_count=big_workers)\n", - "sleep(30)\n" + "ensure_trino_running(worker_desired_count=workers, big_worker_desired_count=big_workers)\n", + "sleep(30)" ] }, { @@ -86,7 +89,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = iceberg_sql('select * from conformance_sust_op')" + "df = iceberg_sql(\"select * from conformance_sust_op\")" ] }, { @@ -301,7 +304,9 @@ } ], "source": [ - "df.query('year ==2024 and month==1 and total_count > 5 and nonconformance_sust_op_count == 0')" + "df.query(\n", + " \"year ==2024 and month==1 and total_count > 5 and nonconformance_sust_op_count == 0\"\n", + ")" ] }, { @@ -1545,16 +1550,28 @@ " where day_night = 'day'\n", " group by site_id, day, day_night\n", " \"\"\")\n", - " # \n", + " #\n", " sleep(20)\n", - " print(f\"Completed year={year}, month={month}, v_threshold={v_threshold}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " print(\n", + " f\"Completed year={year}, month={month}, v_threshold={v_threshold}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return None\n", "\n", - "tasks = [(year, month, v_threshold, split_cons) for year in (2024, 2025) for month in range(1, 13) for v_threshold in range(253, 259) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", - " \n", + "\n", + "tasks = [\n", + " (year, month, v_threshold, split_cons)\n", + " for year in (2024, 2025)\n", + " for month in range(1, 13)\n", + " for v_threshold in range(253, 259)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", + "\n", "df2 = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -1577,6 +1594,8 @@ ], "source": [ "v_threshold = 258\n", + "\n", + "\n", "def run_func(args):\n", " year, month, v_threshold, split_cons = args\n", " df = iceberg_sql(f\"\"\"\n", @@ -1618,16 +1637,27 @@ " where day_night = 'day'\n", " group by site_id, day, day_night\n", " \"\"\")\n", - " # \n", - " \n", - " print(f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\")\n", + " #\n", + "\n", + " print(\n", + " f\"Completed year={year}, month={month}, {split_cons.replace('system.bucket(postcode, 16)', 'bucket')}\"\n", + " )\n", " return df\n", "\n", - "tasks = [(year, month, split_cons) for year in (2024, ) for month in range(1, 2) \n", - " for split_cons in ['system.bucket(postcode, 16) <= 3', '(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)', \n", - " '(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)', 'system.bucket(postcode, 16) > 11'] ]\n", - " # for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", - " \n", + "\n", + "tasks = [\n", + " (year, month, split_cons)\n", + " for year in (2024,)\n", + " for month in range(1, 2)\n", + " for split_cons in [\n", + " \"system.bucket(postcode, 16) <= 3\",\n", + " \"(system.bucket(postcode, 16) > 3 and system.bucket(postcode, 16) <= 7)\",\n", + " \"(system.bucket(postcode, 16) > 7 and system.bucket(postcode, 16) <= 11)\",\n", + " \"system.bucket(postcode, 16) > 11\",\n", + " ]\n", + "]\n", + "# for split_cons in ['system.bucket(postcode, 16) <= 1'] ]\n", + "\n", "df2 = trino_parallel(run_func, tasks, num_workers=num_workers)" ] }, @@ -2079,7 +2109,7 @@ } ], "source": [ - "df['site_id'].nunique()" + "df[\"site_id\"].nunique()" ] }, { diff --git a/darth/config.json b/darth/config.json index deca1bb..214feaa 100644 --- a/darth/config.json +++ b/darth/config.json @@ -8,4 +8,4 @@ "darth_password": "hS*8glEv56!#sU", "ssh_private_key_path": "/home/ubuntu/.ssh/id_rsa", "ssh_private_key_password":"" -} \ No newline at end of file +} diff --git a/darth/darth_functions.py b/darth/darth_functions.py index a1fb67d..6b9a256 100644 --- a/darth/darth_functions.py +++ b/darth/darth_functions.py @@ -1,21 +1,23 @@ +import json +import os +import urllib.parse + import pandas as pd import psycopg2 -from sshtunnel import SSHTunnelForwarder -from sqlalchemy import create_engine -import urllib.parse -import os, json import yaml +from sqlalchemy import create_engine +from sshtunnel import SSHTunnelForwarder + def get_darth_data(config, sql_query): with SSHTunnelForwarder( - ssh_address_or_host=(config["hostserver"], 22), - ssh_username=config["ssh_username"], - ssh_pkey=config["ssh_private_key_path"], - ssh_private_key_password=config["ssh_private_key_password"], - host_pkey_directories=[], - remote_bind_address=(config["remote_host"], config["remote_port"]) + ssh_address_or_host=(config["hostserver"], 22), + ssh_username=config["ssh_username"], + ssh_pkey=config["ssh_private_key_path"], + ssh_private_key_password=config["ssh_private_key_password"], + host_pkey_directories=[], + remote_bind_address=(config["remote_host"], config["remote_port"]), ) as tunnel: - tunnel.start() local_port = str(tunnel.local_bind_port) @@ -23,12 +25,14 @@ def get_darth_data(config, sql_query): # Needed to handle special characters in password i.e. "@" darth_password = urllib.parse.quote_plus(config["darth_password"]) - engine_str = 'postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}'.format( - user=config["darth_username"], - password=darth_password, - host='localhost', - port=local_port, - db=config["databasename"] + engine_str = ( + "postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}".format( + user=config["darth_username"], + password=darth_password, + host="localhost", + port=local_port, + db=config["databasename"], + ) ) engine = create_engine(engine_str) diff --git a/darth/test.ipynb b/darth/test.ipynb index 0537d55..6e4d8f4 100644 --- a/darth/test.ipynb +++ b/darth/test.ipynb @@ -17,10 +17,13 @@ } ], "source": [ - "from darth.darth_functions import *\n", "from sklearn.neighbors import KDTree\n", + "\n", + "from darth.darth_functions import *\n", "from visualisation import *\n", - "with open(\"darth/config.json\", \"r\") as file: config= json.load(file)\n" + "\n", + "with open(\"darth/config.json\", \"r\") as file:\n", + " config = json.load(file)" ] }, { @@ -45,7 +48,7 @@ "# bom_data_vic_v\n", "# bom_data_wa_v\n", "# bom_station_details_v\n", - "tablename = 'bom_station_details_v'" + "tablename = \"bom_station_details_v\"" ] }, { @@ -54,7 +57,7 @@ "metadata": {}, "outputs": [], "source": [ - "site_locations = pd.read_csv('tests/darth/site_locations.csv')" + "site_locations = pd.read_csv(\"tests/darth/site_locations.csv\")" ] }, { @@ -71,13 +74,15 @@ } ], "source": [ + "import json\n", + "import os\n", + "import urllib.parse\n", + "\n", "import pandas as pd\n", "import psycopg2\n", - "from sshtunnel import SSHTunnelForwarder\n", + "import yaml\n", "from sqlalchemy import create_engine\n", - "import urllib.parse\n", - "import os, json\n", - "import yaml\n" + "from sshtunnel import SSHTunnelForwarder" ] }, { @@ -108,17 +113,16 @@ } ], "source": [ - "\n", "with SSHTunnelForwarder(\n", - " ssh_address_or_host=(config[\"hostserver\"], 22),\n", - " ssh_username=config[\"ssh_username\"],\n", - " ssh_pkey=config[\"ssh_private_key_path\"],\n", - " ssh_private_key_password=config[\"ssh_private_key_password\"],\n", - " host_pkey_directories=[],\n", - " remote_bind_address=(config[\"remote_host\"], config[\"remote_port\"])\n", - " ) as tunnel:\n", + " ssh_address_or_host=(config[\"hostserver\"], 22),\n", + " ssh_username=config[\"ssh_username\"],\n", + " ssh_pkey=config[\"ssh_private_key_path\"],\n", + " ssh_private_key_password=config[\"ssh_private_key_password\"],\n", + " host_pkey_directories=[],\n", + " remote_bind_address=(config[\"remote_host\"], config[\"remote_port\"]),\n", + ") as tunnel:\n", " # tunnel.start()\n", - " print('hi')\n", + " print(\"hi\")\n", " # local_port = str(tunnel.local_bind_port)\n", "\n", " # # Needed to handle special characters in password i.e. \"@\"\n", @@ -368,15 +372,18 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "station_coords = station_locations[['latitude', 'longitude']].to_numpy()\n", - "site_coords = site_locations[['latitude', 'longitude']].to_numpy()\n", - "kdtree = KDTree(station_coords, metric='euclidean')\n", + "station_coords = station_locations[[\"latitude\", \"longitude\"]].to_numpy()\n", + "site_coords = site_locations[[\"latitude\", \"longitude\"]].to_numpy()\n", + "kdtree = KDTree(station_coords, metric=\"euclidean\")\n", "distances, indices = kdtree.query(site_coords, k=1) # k=1 → nearest\n", "nearest_indices = indices.flatten()\n", "nearest_distances = distances.flatten()\n", - "site_locations[['n_long', 'n_lat', 'station_number']] = station_locations.iloc[nearest_indices][['longitude', 'latitude', 'station_number']].values\n", - "site_locations['distance_km'] = nearest_distances*111 # Rough conversion factor for degrees to kilometers" + "site_locations[[\"n_long\", \"n_lat\", \"station_number\"]] = station_locations.iloc[\n", + " nearest_indices\n", + "][[\"longitude\", \"latitude\", \"station_number\"]].values\n", + "site_locations[\"distance_km\"] = (\n", + " nearest_distances * 111\n", + ") # Rough conversion factor for degrees to kilometers" ] }, { @@ -444,7 +451,7 @@ } ], "source": [ - "site_locations.query('site_id == 1299741610')" + "site_locations.query(\"site_id == 1299741610\")" ] }, { @@ -464,7 +471,7 @@ } ], "source": [ - "','.join(site_locations['station_number'].unique())" + "\",\".join(site_locations[\"station_number\"].unique())" ] }, { @@ -475,14 +482,14 @@ "source": [ "# {','.join(site_locations['station_number'].unique())}\n", "df_list = []\n", - "state = ['ant', 'nsw', 'vic', 'tas', 'qld', 'sa', 'nt', 'wa']\n", - "offset = [ 10, 10, 10, 10, 10, 9.5, 9.5, 8 ]\n", + "state = [\"ant\", \"nsw\", \"vic\", \"tas\", \"qld\", \"sa\", \"nt\", \"wa\"]\n", + "offset = [10, 10, 10, 10, 10, 9.5, 9.5, 8]\n", "# state = ['nsw']\n", "# offset = [ 10 ]\n", "for s, o in zip(state, offset):\n", " query = f\"\"\"select station_number, date_trunc('day', datetime) as day, max(air_temperature_in_degrees_c) as temp\n", " from bom_data_{s}_v\n", - " where station_number in ({','.join(site_locations['station_number'].unique())}) and year=2024 and extract(hour from datetime) in (10, 11, 12, 13, 14)\n", + " where station_number in ({\",\".join(site_locations[\"station_number\"].unique())}) and year=2024 and extract(hour from datetime) in (10, 11, 12, 13, 14)\n", " group by station_number, date_trunc('day', datetime)\n", " order by day asc\n", " \"\"\"\n", @@ -618,7 +625,7 @@ } ], "source": [ - "df[df['station_number']==67119]" + "df[df[\"station_number\"] == 67119]" ] }, { @@ -627,7 +634,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.to_parquet('tests/darth/station_temp_2024.parquet', index=False)" + "df.to_parquet(\"tests/darth/station_temp_2024.parquet\", index=False)" ] }, { @@ -636,7 +643,9 @@ "metadata": {}, "outputs": [], "source": [ - "site_locations[['site_id', 'station_number']].to_parquet('tests/darth/site_station_mapping.parquet', index=False)" + "site_locations[[\"site_id\", \"station_number\"]].to_parquet(\n", + " \"tests/darth/site_station_mapping.parquet\", index=False\n", + ")" ] }, { @@ -657,29 +666,57 @@ } ], "source": [ - "start_time = '2024-01-01 00:00:00+11:00' # In sydney local time\n", - "end_time = '2024-02-01 00:00:00+11:00' # In sydney local time\n", + "start_time = \"2024-01-01 00:00:00+11:00\" # In sydney local time\n", + "end_time = \"2024-02-01 00:00:00+11:00\" # In sydney local time\n", "\n", - "num_ticks = 24*2+1\n", + "num_ticks = 24 * 2 + 1\n", "# save_as = 'Figures/EDP_voltwatt_12Nov.jpeg'\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['temp (^oC)']\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\"temp (^oC)\"]\n", "\n", - "plt_config = {'temp': [0, 0, '-', None, None]\n", - "}\n", + "plt_config = {\"temp\": [0, 0, \"-\", None, None]}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='station_number', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[12/2.54,1.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['upper left'], \n", - " x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', legend_i=0, legend_j=None, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"station_number\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[12 / 2.54, 1.5],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " legend_j=None,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -700,7 +737,7 @@ } ], "source": [ - "os.path.exists(config['ssh_private_key_path'])" + "os.path.exists(config[\"ssh_private_key_path\"])" ] }, { @@ -710,10 +747,10 @@ "outputs": [], "source": [ "edp_path = \"/home/hossein/CICCADA/tests/4) Data/EDP SA 2023 Data\"\n", - "meta_data1= pd.read_csv(edp_path+\"/edp_sites_metadata_sa_postcode.csv\")\n", - "meta_data2 = pd.read_csv(edp_path+\"/edp_sites_metadata59239829.csv\")\n", - "meta_data2 = meta_data2[meta_data2['state'] == 'SA']\n", - "SA_site_ids = meta_data2['edp_site_id'].unique()" + "meta_data1 = pd.read_csv(edp_path + \"/edp_sites_metadata_sa_postcode.csv\")\n", + "meta_data2 = pd.read_csv(edp_path + \"/edp_sites_metadata59239829.csv\")\n", + "meta_data2 = meta_data2[meta_data2[\"state\"] == \"SA\"]\n", + "SA_site_ids = meta_data2[\"edp_site_id\"].unique()" ] }, { diff --git a/tests/EDP_12Nov2022.ipynb b/tests/EDP_12Nov2022.ipynb index 458560d..c53b135 100755 --- a/tests/EDP_12Nov2022.ipynb +++ b/tests/EDP_12Nov2022.ipynb @@ -10,9 +10,11 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "from epd_process_funcs import *" + "\n", + "sys.path.append(\"../\")\n", + "from epd_process_funcs import *\n", + "\n", + "from visualisation import *" ] }, { @@ -22,12 +24,15 @@ "metadata": {}, "outputs": [], "source": [ - "solar=pd.read_csv('/home/hossein/CICCADA/BOM_NCI/2022/NCI_processed_grouped_Nov.csv')\n", - "solar['time'] = pd.to_datetime(solar['time'])\n", - "solar['time'] = solar['time'].dt.tz_localize('utc')\n", - "solar['time'] = solar['time'].dt.tz_convert(pytz.FixedOffset(9.5*60))\n", - "solar['postcode'] = solar['postcode'].astype(int)\n", - "solar.rename(columns={'surface_global_irradiance': 'GHI', 'direct_normal_irradiance': 'DNI'}, inplace=True)" + "solar = pd.read_csv(\"/home/hossein/CICCADA/BOM_NCI/2022/NCI_processed_grouped_Nov.csv\")\n", + "solar[\"time\"] = pd.to_datetime(solar[\"time\"])\n", + "solar[\"time\"] = solar[\"time\"].dt.tz_localize(\"utc\")\n", + "solar[\"time\"] = solar[\"time\"].dt.tz_convert(pytz.FixedOffset(9.5 * 60))\n", + "solar[\"postcode\"] = solar[\"postcode\"].astype(int)\n", + "solar.rename(\n", + " columns={\"surface_global_irradiance\": \"GHI\", \"direct_normal_irradiance\": \"DNI\"},\n", + " inplace=True,\n", + ")" ] }, { @@ -48,7 +53,7 @@ } ], "source": [ - "5068 in solar['postcode'].unique()" + "5068 in solar[\"postcode\"].unique()" ] }, { @@ -59,7 +64,7 @@ "outputs": [], "source": [ "edp_path = \"4) Data/EDP SA 2023 Data\"\n", - "df = pd.read_csv('edp_data_2022_11_v.csv')" + "df = pd.read_csv(\"edp_data_2022_11_v.csv\")" ] }, { @@ -69,13 +74,26 @@ "metadata": {}, "outputs": [], "source": [ - "meta_data1= pd.read_csv(edp_path+\"/edp_sites_metadata_sa_postcode.csv\")\n", - "meta_data2 = pd.read_csv(edp_path+\"/edp_sites_metadata59239829.csv\")\n", - "meta_data2 = meta_data2[meta_data2['state'] == 'SA']\n", - "meta_data3 = meta_data2.merge(meta_data1[['edp_site_id', 'postcode']], on='edp_site_id', how='left')\n", - "meta_data3['Srated'] = meta_data3['inverter_ac_rating_kw']*meta_data3['inverter_count']\n", - "meta_data3['Srated'] = meta_data3.apply(lambda row: row['inverter_ac_rating_kw'] if pd.isna(row['inverter_count']) else row['Srated'], axis=1)\n", - "meta_data2 = meta_data3.groupby(['edp_site_id', 'postcode']).agg({'Srated':'sum'}).reset_index()" + "meta_data1 = pd.read_csv(edp_path + \"/edp_sites_metadata_sa_postcode.csv\")\n", + "meta_data2 = pd.read_csv(edp_path + \"/edp_sites_metadata59239829.csv\")\n", + "meta_data2 = meta_data2[meta_data2[\"state\"] == \"SA\"]\n", + "meta_data3 = meta_data2.merge(\n", + " meta_data1[[\"edp_site_id\", \"postcode\"]], on=\"edp_site_id\", how=\"left\"\n", + ")\n", + "meta_data3[\"Srated\"] = (\n", + " meta_data3[\"inverter_ac_rating_kw\"] * meta_data3[\"inverter_count\"]\n", + ")\n", + "meta_data3[\"Srated\"] = meta_data3.apply(\n", + " lambda row: (\n", + " row[\"inverter_ac_rating_kw\"]\n", + " if pd.isna(row[\"inverter_count\"])\n", + " else row[\"Srated\"]\n", + " ),\n", + " axis=1,\n", + ")\n", + "meta_data2 = (\n", + " meta_data3.groupby([\"edp_site_id\", \"postcode\"]).agg({\"Srated\": \"sum\"}).reset_index()\n", + ")" ] }, { @@ -85,7 +103,7 @@ "metadata": {}, "outputs": [], "source": [ - "df5 = process_edp(df, meta_data2, 10.5*60)\n" + "df5 = process_edp(df, meta_data2, 10.5 * 60)" ] }, { @@ -95,7 +113,7 @@ "metadata": {}, "outputs": [], "source": [ - "ii=0" + "ii = 0" ] }, { @@ -138,39 +156,89 @@ ], "source": [ "df8 = df5.copy()\n", - "df8 = df8[df8['edp_site_id'].isin(df8['edp_site_id'].unique()[ii:ii+1])].reset_index(drop=True)\n", - "df8 = df8.merge(solar, on=['time', 'postcode'], how='left')\n", - "df8 = df8.sort_values(['postcode', 'time']).groupby('postcode').apply(lambda group: group.set_index('time').interpolate(method='time', limit=1).reset_index()).reset_index(drop=True)\n", - "start_time = '2022-11-15 00:00:00+09:30' # In sydney local time\n", - "end_time = '2022-11-18 00:00:00+09:30' # In sydney local time\n", - "ii+=1\n", + "df8 = df8[\n", + " df8[\"edp_site_id\"].isin(df8[\"edp_site_id\"].unique()[ii : ii + 1])\n", + "].reset_index(drop=True)\n", + "df8 = df8.merge(solar, on=[\"time\", \"postcode\"], how=\"left\")\n", + "df8 = (\n", + " df8.sort_values([\"postcode\", \"time\"])\n", + " .groupby(\"postcode\")\n", + " .apply(\n", + " lambda group: (\n", + " group.set_index(\"time\").interpolate(method=\"time\", limit=1).reset_index()\n", + " )\n", + " )\n", + " .reset_index(drop=True)\n", + ")\n", + "start_time = \"2022-11-15 00:00:00+09:30\" # In sydney local time\n", + "end_time = \"2022-11-18 00:00:00+09:30\" # In sydney local time\n", + "ii += 1\n", "\n", - "num_ticks = 24*2+1\n", + "num_ticks = 24 * 2 + 1\n", "# save_as = 'Figures/EDP_voltwatt_12Nov.jpeg'\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['GHI', 'Cloud', \n", - " 'Active power (kW)', 'Active power (kW)',\n", - " 'Reactive power (kvar)', 'Reactive power (kvar)','Reactive power (kvar)', 'voltage (V)']\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"GHI\",\n", + " \"Cloud\",\n", + " \"Active power (kW)\",\n", + " \"Active power (kW)\",\n", + " \"Reactive power (kvar)\",\n", + " \"Reactive power (kvar)\",\n", + " \"Reactive power (kvar)\",\n", + " \"voltage (V)\",\n", + "]\n", "\n", - "plt_config = {'GHI': [0, 0, '-', None, None],\n", - " 'cloud_type': [0, 1, '-', None, None],\n", - " 'P_threshold': [1, 0, '-.', None, None], 'active_power': [1, 0, '-', None, None],\n", - "'Q_voltvar_max': [2, 0, '-.', None, None], 'Q_voltvar_min': [2, 0, '-.', None, None], 'reactive_power': [2, 0, '-', None, None],\n", - "'voltage_avg': [1, 1, '-', None, None]\n", + "plt_config = {\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + " \"P_threshold\": [1, 0, \"-.\", None, None],\n", + " \"active_power\": [1, 0, \"-\", None, None],\n", + " \"Q_voltvar_max\": [2, 0, \"-.\", None, None],\n", + " \"Q_voltvar_min\": [2, 0, \"-.\", None, None],\n", + " \"reactive_power\": [2, 0, \"-\", None, None],\n", + " \"voltage_avg\": [1, 1, \"-\", None, None],\n", "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df8, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[14/2.54,1.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'upper left', 'upper right', 'lower left'], \n", - " x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df8,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[14 / 2.54, 1.5],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"upper left\", \"upper right\", \"lower left\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -181,7 +249,7 @@ "metadata": {}, "outputs": [], "source": [ - "ii=0" + "ii = 0" ] }, { @@ -213,8 +281,8 @@ ], "source": [ "df8 = df5.query(f\"P_noncomp > 0 and voltage_avg > 250\").reset_index(drop=True)\n", - "site_ids = df8['edp_site_id'].unique()[0]\n", - "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")['time'].to_list()[ii]\n", + "site_ids = df8[\"edp_site_id\"].unique()[0]\n", + "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")[\"time\"].to_list()[ii]\n", "# ii+=1\n", "t0 = time_t.replace(hour=0, minute=0, second=0, microsecond=0)\n", "t1 = t0 + pd.Timedelta(days=1)\n", @@ -222,28 +290,60 @@ "print(f\"Time: {time_t}\")\n", "print(f\"site_id: {site_ids}, postcode: {df8['postcode'].unique()[0]}\")\n", "df8 = df5.copy()\n", - "df8 = df8[df8['edp_site_id'] == site_ids].reset_index(drop=True)\n", - "start_time = t0 # In sydney local time\n", - "end_time = t1 # In sydney local time\n", + "df8 = df8[df8[\"edp_site_id\"] == site_ids].reset_index(drop=True)\n", + "start_time = t0 # In sydney local time\n", + "end_time = t1 # In sydney local time\n", "\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/EDP_voltwatt_19Nov.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['Active power (kW)', 'Active power (kW)', 'voltage (V)']\n", - "plt_config = {'P_threshold': [0, 0, '-.', None, None], 'active_power': [0, 0, '-', None, None],\n", - "'voltage_avg': [0, 1, '-', None, None]}\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/EDP_voltwatt_19Nov.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\"Active power (kW)\", \"Active power (kW)\", \"voltage (V)\"]\n", + "plt_config = {\n", + " \"P_threshold\": [0, 0, \"-.\", None, None],\n", + " \"active_power\": [0, 0, \"-\", None, None],\n", + " \"voltage_avg\": [0, 1, \"-\", None, None],\n", + "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df8, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[14/2.54,2], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df8,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[14 / 2.54, 2],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -254,7 +354,7 @@ "metadata": {}, "outputs": [], "source": [ - "ii=0" + "ii = 0" ] }, { @@ -285,38 +385,78 @@ } ], "source": [ - "df8 = df5.query(f\"Q_noncomp > .2 and voltage_avg > 250 and reactive_power < -.5\").reset_index(drop=True)\n", - "site_ids = df8['edp_site_id'].unique()[0]\n", - "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")['time'].to_list()[ii]\n", - "ii+=1\n", + "df8 = df5.query(\n", + " f\"Q_noncomp > .2 and voltage_avg > 250 and reactive_power < -.5\"\n", + ").reset_index(drop=True)\n", + "site_ids = df8[\"edp_site_id\"].unique()[0]\n", + "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")[\"time\"].to_list()[ii]\n", + "ii += 1\n", "t0 = time_t.replace(hour=0, minute=0, second=0, microsecond=0)\n", "t1 = t0 + pd.Timedelta(days=1)\n", "print(f\"Volt-var non-compliance\")\n", "print(f\"Time: {time_t}\")\n", "print(f\"site_id: {site_ids}, postcode: {df8['postcode'].unique()[0]}\")\n", "df8 = df5.copy()\n", - "df8 = df8[df8['edp_site_id'] == site_ids].reset_index(drop=True)\n", - "start_time = t0 # In sydney local time\n", - "end_time = t1 # In sydney local time\n", + "df8 = df8[df8[\"edp_site_id\"] == site_ids].reset_index(drop=True)\n", + "start_time = t0 # In sydney local time\n", + "end_time = t1 # In sydney local time\n", "\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/EDP_voltvar_19Nov.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['Reactive power (kvar)', 'Reactive power (kvar)','Reactive power (kvar)', 'voltage (V)']\n", - "plt_config = {'Q_voltvar_max': [0, 0, '-.', None, None], 'Q_voltvar_min': [0, 0, '-.', None, None], 'reactive_power': [0, 0, '-', None, None],\n", - "'voltage_avg': [0, 1, '-', None, None]}\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/EDP_voltvar_19Nov.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"Reactive power (kvar)\",\n", + " \"Reactive power (kvar)\",\n", + " \"Reactive power (kvar)\",\n", + " \"voltage (V)\",\n", + "]\n", + "plt_config = {\n", + " \"Q_voltvar_max\": [0, 0, \"-.\", None, None],\n", + " \"Q_voltvar_min\": [0, 0, \"-.\", None, None],\n", + " \"reactive_power\": [0, 0, \"-\", None, None],\n", + " \"voltage_avg\": [0, 1, \"-\", None, None],\n", + "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df8, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[14/2.54,2], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['lower left', 'center right', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df8,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[14 / 2.54, 2],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"lower left\", \"center right\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -346,36 +486,77 @@ ], "source": [ "df8 = df5.copy()\n", - "df8 = df8[df8['edp_site_id'].isin(df8['edp_site_id'].unique()[2:3])]\n", - "df8 = df8.merge(solar, on=['time', 'postcode'], how='left')\n", - "df8 = df8.sort_values(['postcode', 'time']).groupby('postcode').apply(lambda group: group.set_index('time').interpolate(method='time', limit=1).reset_index()).reset_index(drop=True)\n", + "df8 = df8[df8[\"edp_site_id\"].isin(df8[\"edp_site_id\"].unique()[2:3])]\n", + "df8 = df8.merge(solar, on=[\"time\", \"postcode\"], how=\"left\")\n", + "df8 = (\n", + " df8.sort_values([\"postcode\", \"time\"])\n", + " .groupby(\"postcode\")\n", + " .apply(\n", + " lambda group: (\n", + " group.set_index(\"time\").interpolate(method=\"time\", limit=1).reset_index()\n", + " )\n", + " )\n", + " .reset_index(drop=True)\n", + ")\n", "\n", - "start_time = '2022-11-19 00:00:00+1030' # In sydney local time\n", - "end_time = '2022-11-20 00:00:00+1030' # In sydney local time\n", + "start_time = \"2022-11-19 00:00:00+1030\" # In sydney local time\n", + "end_time = \"2022-11-20 00:00:00+1030\" # In sydney local time\n", "\n", - "num_ticks = 2*24+1\n", - "save_as = 'Figures/Solar_clouds_active_power19Nov.jpeg'\n", - "x_label = 'time'\n", + "num_ticks = 2 * 24 + 1\n", + "save_as = \"Figures/Solar_clouds_active_power19Nov.jpeg\"\n", + "x_label = \"time\"\n", "y_labels = [\n", - " 'GHI ($\\mathdefault{W/m^2}$)', \\\n", - " 'Cloud type',\n", - " 'Active power (kW)', ]\n", + " \"GHI ($\\mathdefault{W/m^2}$)\",\n", + " \"Cloud type\",\n", + " \"Active power (kW)\",\n", + "]\n", "plt_config = {\n", - " 'GHI': [0, 0, '-', None, None],\n", - " 'cloud_type': [0, 1, '-', None, None],\n", - " # 'DNI': [0, 0, '-', None, None],\n", - " 'active_power': [1, 0, '-', None, None],\n", - " }\n", - "color_nights=False\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + " # 'DNI': [0, 0, '-', None, None],\n", + " \"active_power\": [1, 0, \"-\", None, None],\n", + "}\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.2f'\n", - "a=my_plot4(start_time, end_time, df8, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='turbo',\n", - " figsize=[14/2.54,1.25], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=7, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['center left', 'center right', 'center left'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, hspace=0,rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', legend_i=0, title_i=0, only1title=1, onlyntime=1)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.2f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df8,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"turbo\",\n", + " figsize=[14 / 2.54, 1.25],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=7,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"center left\", \"center right\", \"center left\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " hspace=0,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=1,\n", + " onlyntime=1,\n", + ")\n", "a.do()" ] } diff --git a/tests/EDP_process.ipynb b/tests/EDP_process.ipynb index d7dda52..4570045 100755 --- a/tests/EDP_process.ipynb +++ b/tests/EDP_process.ipynb @@ -9,9 +9,11 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "from epd_process_funcs import *\n" + "\n", + "sys.path.append(\"../\")\n", + "from epd_process_funcs import *\n", + "\n", + "from visualisation import *" ] }, { @@ -21,12 +23,12 @@ "outputs": [], "source": [ "# solar=pd.read_csv('4) Data/BOM_NCI/2023/NCI_processed_Adelaide_grouped.csv')\n", - "solar=pd.read_csv('/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_all.csv')\n", - "solar['time'] = pd.to_datetime(solar['time'])\n", - "solar['time'] = solar['time'].dt.tz_localize('utc')\n", - "solar['time'] = solar['time'].dt.tz_convert(pytz.FixedOffset(9.5*60))\n", - "solar['postcode'] = solar['postcode'].astype(int)\n", - "solar.rename(columns={'surface_global_irradiance': 'GHI'}, inplace=True)\n", + "solar = pd.read_csv(\"/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_all.csv\")\n", + "solar[\"time\"] = pd.to_datetime(solar[\"time\"])\n", + "solar[\"time\"] = solar[\"time\"].dt.tz_localize(\"utc\")\n", + "solar[\"time\"] = solar[\"time\"].dt.tz_convert(pytz.FixedOffset(9.5 * 60))\n", + "solar[\"postcode\"] = solar[\"postcode\"].astype(int)\n", + "solar.rename(columns={\"surface_global_irradiance\": \"GHI\"}, inplace=True)\n", "solar_bk = solar.copy()" ] }, @@ -37,8 +39,10 @@ "outputs": [], "source": [ "edp_path = \"4) Data/EDP SA 2023 Data\"\n", - "edp_files = glob(edp_path+\"/SA_site_edp_2023_S*.csv\")\n", - "df = pd.concat([pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]).reset_index(drop=True) \n" + "edp_files = glob(edp_path + \"/SA_site_edp_2023_S*.csv\")\n", + "df = pd.concat(\n", + " [pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]\n", + ").reset_index(drop=True)" ] }, { @@ -176,7 +180,7 @@ " order by num_rows desc\n", " \\\n", "\"\"\").to_df()\n", - "db['max_t'] = pd.to_datetime(db['max_t']).dt.tz_localize(pytz.FixedOffset(9.5*60))\n", + "db[\"max_t\"] = pd.to_datetime(db[\"max_t\"]).dt.tz_localize(pytz.FixedOffset(9.5 * 60))\n", "db" ] }, @@ -315,13 +319,26 @@ "metadata": {}, "outputs": [], "source": [ - "meta_data1= pd.read_csv(edp_path+\"/edp_sites_metadata_sa_postcode.csv\")\n", - "meta_data2 = pd.read_csv(edp_path+\"/edp_sites_metadata59239829.csv\")\n", - "meta_data2 = meta_data2[meta_data2['state'] == 'SA']\n", - "meta_data3 = meta_data2.merge(meta_data1[['edp_site_id', 'postcode']], on='edp_site_id', how='left')\n", - "meta_data3['Srated'] = meta_data3['inverter_ac_rating_kw']*meta_data3['inverter_count']\n", - "meta_data3['Srated'] = meta_data3.apply(lambda row: row['inverter_ac_rating_kw'] if pd.isna(row['inverter_count']) else row['Srated'], axis=1)\n", - "meta_data2 = meta_data3.groupby(['edp_site_id', 'postcode']).agg({'Srated':'sum'}).reset_index()\n", + "meta_data1 = pd.read_csv(edp_path + \"/edp_sites_metadata_sa_postcode.csv\")\n", + "meta_data2 = pd.read_csv(edp_path + \"/edp_sites_metadata59239829.csv\")\n", + "meta_data2 = meta_data2[meta_data2[\"state\"] == \"SA\"]\n", + "meta_data3 = meta_data2.merge(\n", + " meta_data1[[\"edp_site_id\", \"postcode\"]], on=\"edp_site_id\", how=\"left\"\n", + ")\n", + "meta_data3[\"Srated\"] = (\n", + " meta_data3[\"inverter_ac_rating_kw\"] * meta_data3[\"inverter_count\"]\n", + ")\n", + "meta_data3[\"Srated\"] = meta_data3.apply(\n", + " lambda row: (\n", + " row[\"inverter_ac_rating_kw\"]\n", + " if pd.isna(row[\"inverter_count\"])\n", + " else row[\"Srated\"]\n", + " ),\n", + " axis=1,\n", + ")\n", + "meta_data2 = (\n", + " meta_data3.groupby([\"edp_site_id\", \"postcode\"]).agg({\"Srated\": \"sum\"}).reset_index()\n", + ")\n", "# meta_data.head()[:2]\n", "# df[:2]" ] @@ -333,12 +350,20 @@ "outputs": [], "source": [ "df7 = df.query(f\"circuit_label=='pv_site_net'\")\n", - "df7 = df7.sort_values(by=['edp_site_id', 'datetime']).reset_index(drop=True)\n", - "df7 = df7.drop_duplicates(subset=['edp_site_id', 'edp_circuit_label'], keep='first').reset_index(drop=True)\n", - "df7_counts = df7.groupby('edp_site_id').count().reset_index()[['edp_site_id', 'edp_device_and_circuit']]\n", - "df7_counts = df7_counts.rename(columns={'edp_device_and_circuit': 'count_circuits'})\n", - "df7 = df7.merge(df7_counts, on='edp_site_id', how='left')\n", - "df7.drop(columns=['voltage_max', 'voltage_min', 'current_min', 'current_max'], inplace=True)\n", + "df7 = df7.sort_values(by=[\"edp_site_id\", \"datetime\"]).reset_index(drop=True)\n", + "df7 = df7.drop_duplicates(\n", + " subset=[\"edp_site_id\", \"edp_circuit_label\"], keep=\"first\"\n", + ").reset_index(drop=True)\n", + "df7_counts = (\n", + " df7.groupby(\"edp_site_id\")\n", + " .count()\n", + " .reset_index()[[\"edp_site_id\", \"edp_device_and_circuit\"]]\n", + ")\n", + "df7_counts = df7_counts.rename(columns={\"edp_device_and_circuit\": \"count_circuits\"})\n", + "df7 = df7.merge(df7_counts, on=\"edp_site_id\", how=\"left\")\n", + "df7.drop(\n", + " columns=[\"voltage_max\", \"voltage_min\", \"current_min\", \"current_max\"], inplace=True\n", + ")\n", "df7 = df7.query(f\"count_circuits > 1\")\n", "# df7 = df7.query(f\"count_circuits ==2\")\n", "# df7 = df7.query(f\"edp_site_id=='S0343'\")" @@ -350,7 +375,6 @@ "metadata": {}, "outputs": [], "source": [ - "\n", "# df7.merge(meta_data2, on='edp_site_id', how='left').drop(columns=['unix_time', 'state','limit_enabled','has_battery','monitoring_hardware','islandable','limit_amount','limit_applied','site_timezone','last_date_metadata_received','battery_size_make_model','datetime','edp_device_and_circuit','circuit_label','first_date_metadata_received'])" ] }, @@ -520,7 +544,7 @@ } ], "source": [ - "meta_data3[['edp_site_id','inverter_ac_rating_kw','inverter_count','Srated']]" + "meta_data3[[\"edp_site_id\", \"inverter_ac_rating_kw\", \"inverter_count\", \"Srated\"]]" ] }, { @@ -580,7 +604,15 @@ } ], "source": [ - "meta_data3.query(f\"edp_site_id=='S0178'\")[['edp_site_id','inverter_ac_rating_kw', 'inverter_count', 'subarray_model', 'Srated']]" + "meta_data3.query(f\"edp_site_id=='S0178'\")[\n", + " [\n", + " \"edp_site_id\",\n", + " \"inverter_ac_rating_kw\",\n", + " \"inverter_count\",\n", + " \"subarray_model\",\n", + " \"Srated\",\n", + " ]\n", + "]" ] }, { @@ -706,7 +738,16 @@ } ], "source": [ - "meta_data3.query(f\"edp_site_id=='S0235'\")[['edp_site_id','inverter_ac_rating_kw', 'inverter_count', 'subarray_model', 'subarray_tilt', 'subarray_orientation']]" + "meta_data3.query(f\"edp_site_id=='S0235'\")[\n", + " [\n", + " \"edp_site_id\",\n", + " \"inverter_ac_rating_kw\",\n", + " \"inverter_count\",\n", + " \"subarray_model\",\n", + " \"subarray_tilt\",\n", + " \"subarray_orientation\",\n", + " ]\n", + "]" ] }, { @@ -857,9 +898,15 @@ "source": [ "df1 = df5.copy()\n", "df1 = df1.query(f\"edp_site_id == 'S0235'\").reset_index(drop=True)\n", - "offset = df1['time'].dt.strftime('%z').str[:][0]\n", - "df3 = df1.merge(solar, on=['time', 'postcode'], how='left')\n", - "df1['time'].min(), df1['time'].max(), offset, df1['active_power'].max(), df1['Srated'].max()" + "offset = df1[\"time\"].dt.strftime(\"%z\").str[:][0]\n", + "df3 = df1.merge(solar, on=[\"time\", \"postcode\"], how=\"left\")\n", + "(\n", + " df1[\"time\"].min(),\n", + " df1[\"time\"].max(),\n", + " offset,\n", + " df1[\"active_power\"].max(),\n", + " df1[\"Srated\"].max(),\n", + ")" ] }, { @@ -919,7 +966,15 @@ } ], "source": [ - "meta_data3.query(f\"edp_site_id=='S0178'\")[['edp_site_id','inverter_ac_rating_kw', 'inverter_count', 'subarray_model', 'Srated']]" + "meta_data3.query(f\"edp_site_id=='S0178'\")[\n", + " [\n", + " \"edp_site_id\",\n", + " \"inverter_ac_rating_kw\",\n", + " \"inverter_count\",\n", + " \"subarray_model\",\n", + " \"Srated\",\n", + " ]\n", + "]" ] }, { @@ -1024,7 +1079,7 @@ } ], "source": [ - "df7.query(f\"edp_site_id=='S0178'\")\n" + "df7.query(f\"edp_site_id=='S0178'\")" ] }, { @@ -1066,8 +1121,7 @@ } ], "source": [ - "\n", - "df5 = process_edp(df, meta_data2, offset=9.5*60)" + "df5 = process_edp(df, meta_data2, offset=9.5 * 60)" ] }, { @@ -1085,7 +1139,9 @@ } ], "source": [ - "print(f\"No. of all sites: {df5['edp_site_id'].unique().shape[0]}, \\nNo. of wrong sites based on maxP: {df5.query(f'wrong_on_maxP==True')['edp_site_id'].unique().shape[0]}\")" + "print(\n", + " f\"No. of all sites: {df5['edp_site_id'].unique().shape[0]}, \\nNo. of wrong sites based on maxP: {df5.query(f'wrong_on_maxP==True')['edp_site_id'].unique().shape[0]}\"\n", + ")" ] }, { @@ -1208,7 +1264,9 @@ } ], "source": [ - "df5.groupby('edp_site_id').agg({'Q_noncomp': lambda x:sum(x)/12, 'wrong_on_maxP':'first'}).query(f\"Q_noncomp > 0\").sort_values('Q_noncomp', ascending=False)" + "df5.groupby(\"edp_site_id\").agg(\n", + " {\"Q_noncomp\": lambda x: sum(x) / 12, \"wrong_on_maxP\": \"first\"}\n", + ").query(f\"Q_noncomp > 0\").sort_values(\"Q_noncomp\", ascending=False)" ] }, { @@ -1228,7 +1286,7 @@ } ], "source": [ - "df5.query(f\"edp_site_id=='S0463'\")['active_power'].sum()/12" + "df5.query(f\"edp_site_id=='S0463'\")[\"active_power\"].sum() / 12" ] }, { @@ -1342,7 +1400,9 @@ } ], "source": [ - "df5.query(f\"voltage_avg > 253\").groupby('edp_site_id').agg({'P_noncomp': 'sum', 'wrong_on_maxP':'first'}).sort_values('P_noncomp', ascending=False).head(10)" + "df5.query(f\"voltage_avg > 253\").groupby(\"edp_site_id\").agg(\n", + " {\"P_noncomp\": \"sum\", \"wrong_on_maxP\": \"first\"}\n", + ").sort_values(\"P_noncomp\", ascending=False).head(10)" ] }, { @@ -1465,7 +1525,9 @@ } ], "source": [ - "df5.groupby('edp_site_id').agg({'P_noncomp': 'sum', 'wrong_on_maxP':'first'}).sort_values('P_noncomp', ascending=False)" + "df5.groupby(\"edp_site_id\").agg(\n", + " {\"P_noncomp\": \"sum\", \"wrong_on_maxP\": \"first\"}\n", + ").sort_values(\"P_noncomp\", ascending=False)" ] }, { @@ -1496,8 +1558,10 @@ } ], "source": [ - "print('Num odd voltage_avg values vs num sites')\n", - "df5.groupby('edp_site_id').apply(lambda x:x['voltage_avg'].isna().sum()).value_counts()# " + "print(\"Num odd voltage_avg values vs num sites\")\n", + "df5.groupby(\"edp_site_id\").apply(\n", + " lambda x: x[\"voltage_avg\"].isna().sum()\n", + ").value_counts() #" ] }, { @@ -2216,8 +2280,8 @@ "source": [ "df6 = df5.copy()\n", "# df6 = df6[df['edp_site_id'].isin(df6['edp_site_id'].unique())].reset_index(drop=True)\n", - "df6['DP'] = df6.groupby(['edp_site_id'])['active_power'].transform(lambda x: x.diff())\n", - "df6['DP_r'] = df6['DP'].clip(upper=0).round()\n" + "df6[\"DP\"] = df6.groupby([\"edp_site_id\"])[\"active_power\"].transform(lambda x: x.diff())\n", + "df6[\"DP_r\"] = df6[\"DP\"].clip(upper=0).round()" ] }, { @@ -2269,9 +2333,15 @@ "source": [ "df1 = df5.copy()\n", "df1 = df1.query(f\"edp_site_id == 'S0235'\").reset_index(drop=True)\n", - "offset = df1['time'].dt.strftime('%z').str[:][0]\n", - "df3 = df1.merge(solar, on=['time', 'postcode'], how='left')\n", - "df1['time'].min(), df1['time'].max(), offset, df1['active_power'].max(), df1['Srated'].max()" + "offset = df1[\"time\"].dt.strftime(\"%z\").str[:][0]\n", + "df3 = df1.merge(solar, on=[\"time\", \"postcode\"], how=\"left\")\n", + "(\n", + " df1[\"time\"].min(),\n", + " df1[\"time\"].max(),\n", + " offset,\n", + " df1[\"active_power\"].max(),\n", + " df1[\"Srated\"].max(),\n", + ")" ] }, { @@ -2291,7 +2361,7 @@ } ], "source": [ - "df1['real_power'].max()" + "df1[\"real_power\"].max()" ] }, { @@ -2347,7 +2417,9 @@ " S = P / pf\n", " Q = np.sqrt(S**2 - P**2)\n", " return Q\n", - "get_Q(.6, .8)" + "\n", + "\n", + "get_Q(0.6, 0.8)" ] }, { @@ -2367,7 +2439,7 @@ } ], "source": [ - "df3['postcode'].unique()" + "df3[\"postcode\"].unique()" ] }, { @@ -2398,7 +2470,7 @@ } ], "source": [ - "meta_data['inverter_model'].value_counts().head(10)" + "meta_data[\"inverter_model\"].value_counts().head(10)" ] }, { @@ -2642,7 +2714,7 @@ } ], "source": [ - "'2023-01-01 00:00:00'+offset # In sydney local time" + "\"2023-01-01 00:00:00\" + offset # In sydney local time" ] }, { @@ -2662,30 +2734,69 @@ } ], "source": [ - "start_time = '2023-01-01 00:00:00'+offset # In sydney local time\n", - "end_time = '2023-01-03 00:00:00'+offset # In sydney local time\n", + "start_time = \"2023-01-01 00:00:00\" + offset # In sydney local time\n", + "end_time = \"2023-01-03 00:00:00\" + offset # In sydney local time\n", "\n", "num_ticks = 73\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['Reactive power (kW)', 'Reactive power (kW)', 'Reactive power (kVAr)',\\\n", - " 'Active power (kW)', 'Active power (kW)', \\\n", - " 'voltage (V)']\n", - "plt_config = {'Q_max': [1, 0, '-', None, None], 'Q_min': [1, 0, '-', None, None], 'reactive_power': [1, 0, '-', None, None],\\\n", - " 'P_threshold': [0, 0, '-', None, None],'active_power': [0, 0, '-', None, None],\n", - " 'voltage_avg': [0, 1, '-', None, None]}\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"Reactive power (kW)\",\n", + " \"Reactive power (kW)\",\n", + " \"Reactive power (kVAr)\",\n", + " \"Active power (kW)\",\n", + " \"Active power (kW)\",\n", + " \"voltage (V)\",\n", + "]\n", + "plt_config = {\n", + " \"Q_max\": [1, 0, \"-\", None, None],\n", + " \"Q_min\": [1, 0, \"-\", None, None],\n", + " \"reactive_power\": [1, 0, \"-\", None, None],\n", + " \"P_threshold\": [0, 0, \"-\", None, None],\n", + " \"active_power\": [0, 0, \"-\", None, None],\n", + " \"voltage_avg\": [0, 1, \"-\", None, None],\n", + "}\n", "# y_labels = ['Active power (kW)', 'Apparent power (kVA)']\n", "# plt_config = {'active_power': [0, 0, '-', -2.5, 4.5], 'apparent_power': [1, 1, '-', None, None]}\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.2f'\n", - "a=my_plot4(start_time, end_time, df3, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[17/2.54,2.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['lower left', 'upper right', 'lower left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.5, .5], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.2f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df3,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[17 / 2.54, 2.5],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"lower left\", \"upper right\", \"lower left\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.5, 0.5],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -2706,30 +2817,69 @@ } ], "source": [ - "start_time = '2023-01-01 00:00:00'+offset # In sydney local time\n", - "end_time = '2023-01-03 00:00:00'+offset # In sydney local time\n", + "start_time = \"2023-01-01 00:00:00\" + offset # In sydney local time\n", + "end_time = \"2023-01-03 00:00:00\" + offset # In sydney local time\n", "\n", "num_ticks = 73\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['Reactive power (kW)', 'Reactive power (kW)', 'Reactive power (kVAr)',\\\n", - " 'Active power (kW)', 'Active power (kW)', \\\n", - " 'voltage (V)']\n", - "plt_config = {'Q_max': [1, 0, '-', None, None], 'Q_min': [1, 0, '-', None, None], 'reactive_power': [1, 0, '-', None, None],\\\n", - " 'P_threshold': [0, 0, '-', None, None],'active_power': [0, 0, '-', None, None],\n", - " 'voltage_avg': [0, 1, '-', None, None]}\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"Reactive power (kW)\",\n", + " \"Reactive power (kW)\",\n", + " \"Reactive power (kVAr)\",\n", + " \"Active power (kW)\",\n", + " \"Active power (kW)\",\n", + " \"voltage (V)\",\n", + "]\n", + "plt_config = {\n", + " \"Q_max\": [1, 0, \"-\", None, None],\n", + " \"Q_min\": [1, 0, \"-\", None, None],\n", + " \"reactive_power\": [1, 0, \"-\", None, None],\n", + " \"P_threshold\": [0, 0, \"-\", None, None],\n", + " \"active_power\": [0, 0, \"-\", None, None],\n", + " \"voltage_avg\": [0, 1, \"-\", None, None],\n", + "}\n", "# y_labels = ['Active power (kW)', 'Apparent power (kVA)']\n", "# plt_config = {'active_power': [0, 0, '-', -2.5, 4.5], 'apparent_power': [1, 1, '-', None, None]}\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.2f'\n", - "a=my_plot4(start_time, end_time, df3, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[17/2.54,2.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['lower left', 'upper right', 'lower left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.5, .5], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.2f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df3,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[17 / 2.54, 2.5],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"lower left\", \"upper right\", \"lower left\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.5, 0.5],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -2750,27 +2900,67 @@ } ], "source": [ - "start_time = '2023-01-02 00:00:00'+offset # In sydney local time\n", - "end_time = '2023-01-03 00:00:00'+offset # In sydney local time\n", + "start_time = \"2023-01-02 00:00:00\" + offset # In sydney local time\n", + "end_time = \"2023-01-03 00:00:00\" + offset # In sydney local time\n", "\n", "num_ticks = 73\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['Active power (kW)', 'Active power (kW)', 'Reactive power (kVAr)', 'voltage (V)', 'Apparent power (kVA)']\n", - "plt_config = {'P_threshold': [0, 0, '-', None, None], 'active_power': [0, 0, '-', None, None],'reactive_power': [1, 0, '-', None, None],\n", - "'voltage_avg': [1, 1, '-', None, None], 'apparent_power': [1, 1, '-', None, None]}\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"Active power (kW)\",\n", + " \"Active power (kW)\",\n", + " \"Reactive power (kVAr)\",\n", + " \"voltage (V)\",\n", + " \"Apparent power (kVA)\",\n", + "]\n", + "plt_config = {\n", + " \"P_threshold\": [0, 0, \"-\", None, None],\n", + " \"active_power\": [0, 0, \"-\", None, None],\n", + " \"reactive_power\": [1, 0, \"-\", None, None],\n", + " \"voltage_avg\": [1, 1, \"-\", None, None],\n", + " \"apparent_power\": [1, 1, \"-\", None, None],\n", + "}\n", "# y_labels = ['Active power (kW)', 'Apparent power (kVA)']\n", "# plt_config = {'active_power': [0, 0, '-', -2.5, 4.5]}\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df3, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[17/2.54,2.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['lower left', 'upper right', 'lower left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.5, .5], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df3,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[17 / 2.54, 2.5],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"lower left\", \"upper right\", \"lower left\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.5, 0.5],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -2817,8 +3007,8 @@ "# ax_digit = '1.1f'\n", "# a=my_plot4(start_time, end_time, df3, plt_config=plt_config, ax_digit= ax_digit,\n", "# group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - "# figsize=[17/2.54,2.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - "# num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", + "# figsize=[17/2.54,2.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']],\n", + "# num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M',\n", "# legend_loc=['lower left', 'upper right', 'lower left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", "# plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.5, .5], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", "# a.do()" @@ -2847,8 +3037,8 @@ "# ax_digit = '1.1f'\n", "# a=my_plot4(start_time, end_time, df3, plt_config=plt_config, ax_digit= ax_digit,\n", "# group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - "# figsize=[17/2.54,2.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - "# num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", + "# figsize=[17/2.54,2.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']],\n", + "# num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M',\n", "# legend_loc=['lower left', 'upper right', 'lower left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", "# plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.5, .5], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", "# a.do()" @@ -2877,8 +3067,8 @@ "# ax_digit = '1.1f'\n", "# a=my_plot4(start_time, end_time, df3, plt_config=plt_config, ax_digit= ax_digit,\n", "# group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - "# figsize=[17/2.54,2.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - "# num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", + "# figsize=[17/2.54,2.5], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']],\n", + "# num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M',\n", "# legend_loc=['lower left', 'upper right', 'lower left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", "# plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.5, .5], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", "# a.do()" @@ -2901,7 +3091,7 @@ } ], "source": [ - "df3['postcode'].unique()" + "df3[\"postcode\"].unique()" ] }, { @@ -2910,8 +3100,11 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "df44 = solar.query(f\"postcode==5012\").sort_values(by='time').reset_index(drop=True)[['time', 'postcode', 'surface_global_irradiance']]" + "df44 = (\n", + " solar.query(f\"postcode==5012\")\n", + " .sort_values(by=\"time\")\n", + " .reset_index(drop=True)[[\"time\", \"postcode\", \"surface_global_irradiance\"]]\n", + ")" ] }, { @@ -2920,7 +3113,11 @@ "metadata": {}, "outputs": [], "source": [ - "df11 = df1.query(f\"postcode==5012\").sort_values(by='time').reset_index(drop=True)[['time', 'postcode', 'real_energy', 'voltage_avg']]" + "df11 = (\n", + " df1.query(f\"postcode==5012\")\n", + " .sort_values(by=\"time\")\n", + " .reset_index(drop=True)[[\"time\", \"postcode\", \"real_energy\", \"voltage_avg\"]]\n", + ")" ] }, { @@ -3086,7 +3283,7 @@ } ], "source": [ - "df11.merge(df44, on=['time', 'postcode'], how='left')" + "df11.merge(df44, on=[\"time\", \"postcode\"], how=\"left\")" ] }, { @@ -3106,7 +3303,7 @@ } ], "source": [ - "df44['postcode'][0]" + "df44[\"postcode\"][0]" ] }, { @@ -3137,7 +3334,7 @@ } ], "source": [ - "df3.query(f\"postcode==5012\")['surface_global_irradiance']" + "df3.query(f\"postcode==5012\")[\"surface_global_irradiance\"]" ] }, { @@ -3146,7 +3343,7 @@ "metadata": {}, "outputs": [], "source": [ - "df3['surface_global_irradiance'] = df3['surface_global_irradiance'].fillna(0)" + "df3[\"surface_global_irradiance\"] = df3[\"surface_global_irradiance\"].fillna(0)" ] }, { @@ -3170,8 +3367,8 @@ "# ax_digit = '1.1f'\n", "# a=my_plot4(start_time, end_time, df3, plt_config=plt_config, ax_digit= ax_digit,\n", "# group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='RdYlBu',\n", - "# figsize=[17/2.54,2.], same_scale=False, fontsize=9, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - "# E2P_attr=None, num_ticks=num_ticks, num_yticks=5, dpi=300, special_legend=special_legend, x_format= '%H:%M', MW=False, \n", + "# figsize=[17/2.54,2.], same_scale=False, fontsize=9, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']],\n", + "# E2P_attr=None, num_ticks=num_ticks, num_yticks=5, dpi=300, special_legend=special_legend, x_format= '%H:%M', MW=False,\n", "# legend_loc=['upper left', 'upper right'], x_label=x_label, y_labels=y_labels, kW2MW_attr=kW2MW_attr,\n", "# plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 25, step=True, gridwidth=[0.5, .5], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", "# a.do()" diff --git a/tests/EDP_solar.ipynb b/tests/EDP_solar.ipynb index 8691a56..bf295e6 100755 --- a/tests/EDP_solar.ipynb +++ b/tests/EDP_solar.ipynb @@ -10,9 +10,11 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "from epd_process_funcs import *" + "\n", + "sys.path.append(\"../\")\n", + "from epd_process_funcs import *\n", + "\n", + "from visualisation import *" ] }, { @@ -22,12 +24,15 @@ "metadata": {}, "outputs": [], "source": [ - "solar=pd.read_csv('/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_all.csv')\n", - "solar['time'] = pd.to_datetime(solar['time'])\n", - "solar['time'] = solar['time'].dt.tz_localize('utc')\n", - "solar['time'] = solar['time'].dt.tz_convert(pytz.FixedOffset(8.5*60))\n", - "solar['postcode'] = solar['postcode'].astype(int)\n", - "solar.rename(columns={'surface_global_irradiance': 'GHI', 'direct_normal_irradiance': 'DNI'}, inplace=True)" + "solar = pd.read_csv(\"/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_all.csv\")\n", + "solar[\"time\"] = pd.to_datetime(solar[\"time\"])\n", + "solar[\"time\"] = solar[\"time\"].dt.tz_localize(\"utc\")\n", + "solar[\"time\"] = solar[\"time\"].dt.tz_convert(pytz.FixedOffset(8.5 * 60))\n", + "solar[\"postcode\"] = solar[\"postcode\"].astype(int)\n", + "solar.rename(\n", + " columns={\"surface_global_irradiance\": \"GHI\", \"direct_normal_irradiance\": \"DNI\"},\n", + " inplace=True,\n", + ")" ] }, { @@ -48,7 +53,7 @@ } ], "source": [ - "5068 in solar['postcode'].unique()" + "5068 in solar[\"postcode\"].unique()" ] }, { @@ -59,8 +64,10 @@ "outputs": [], "source": [ "edp_path = \"4) Data/EDP SA 2023 Data\"\n", - "edp_files = glob(edp_path+\"/SA_site_edp_2023_S*.csv\")\n", - "df = pd.concat([pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]).reset_index(drop=True) " + "edp_files = glob(edp_path + \"/SA_site_edp_2023_S*.csv\")\n", + "df = pd.concat(\n", + " [pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]\n", + ").reset_index(drop=True)" ] }, { @@ -70,13 +77,26 @@ "metadata": {}, "outputs": [], "source": [ - "meta_data1= pd.read_csv(edp_path+\"/edp_sites_metadata_sa_postcode.csv\")\n", - "meta_data2 = pd.read_csv(edp_path+\"/edp_sites_metadata59239829.csv\")\n", - "meta_data2 = meta_data2[meta_data2['state'] == 'SA']\n", - "meta_data3 = meta_data2.merge(meta_data1[['edp_site_id', 'postcode']], on='edp_site_id', how='left')\n", - "meta_data3['Srated'] = meta_data3['inverter_ac_rating_kw']*meta_data3['inverter_count']\n", - "meta_data3['Srated'] = meta_data3.apply(lambda row: row['inverter_ac_rating_kw'] if pd.isna(row['inverter_count']) else row['Srated'], axis=1)\n", - "meta_data2 = meta_data3.groupby(['edp_site_id', 'postcode']).agg({'Srated':'sum'}).reset_index()" + "meta_data1 = pd.read_csv(edp_path + \"/edp_sites_metadata_sa_postcode.csv\")\n", + "meta_data2 = pd.read_csv(edp_path + \"/edp_sites_metadata59239829.csv\")\n", + "meta_data2 = meta_data2[meta_data2[\"state\"] == \"SA\"]\n", + "meta_data3 = meta_data2.merge(\n", + " meta_data1[[\"edp_site_id\", \"postcode\"]], on=\"edp_site_id\", how=\"left\"\n", + ")\n", + "meta_data3[\"Srated\"] = (\n", + " meta_data3[\"inverter_ac_rating_kw\"] * meta_data3[\"inverter_count\"]\n", + ")\n", + "meta_data3[\"Srated\"] = meta_data3.apply(\n", + " lambda row: (\n", + " row[\"inverter_ac_rating_kw\"]\n", + " if pd.isna(row[\"inverter_count\"])\n", + " else row[\"Srated\"]\n", + " ),\n", + " axis=1,\n", + ")\n", + "meta_data2 = (\n", + " meta_data3.groupby([\"edp_site_id\", \"postcode\"]).agg({\"Srated\": \"sum\"}).reset_index()\n", + ")" ] }, { @@ -254,7 +274,7 @@ "metadata": {}, "outputs": [], "source": [ - "df5 = process_edp(df, meta_data2, 10.5*60)\n" + "df5 = process_edp(df, meta_data2, 10.5 * 60)" ] }, { @@ -275,7 +295,7 @@ } ], "source": [ - "solar['time'].max()" + "solar[\"time\"].max()" ] }, { @@ -304,36 +324,77 @@ ], "source": [ "df8 = df5.copy()\n", - "df8 = df8[df8['edp_site_id'].isin(df8['edp_site_id'].unique()[2:3])]\n", - "df8 = df8.merge(solar, on=['time', 'postcode'], how='left')\n", - "df8 = df8.sort_values(['postcode', 'time']).groupby('postcode').apply(lambda group: group.set_index('time').interpolate(method='time', limit=1).reset_index()).reset_index(drop=True)\n", + "df8 = df8[df8[\"edp_site_id\"].isin(df8[\"edp_site_id\"].unique()[2:3])]\n", + "df8 = df8.merge(solar, on=[\"time\", \"postcode\"], how=\"left\")\n", + "df8 = (\n", + " df8.sort_values([\"postcode\", \"time\"])\n", + " .groupby(\"postcode\")\n", + " .apply(\n", + " lambda group: (\n", + " group.set_index(\"time\").interpolate(method=\"time\", limit=1).reset_index()\n", + " )\n", + " )\n", + " .reset_index(drop=True)\n", + ")\n", "\n", - "start_time = '2023-01-02 00:00:00+1030' # In sydney local time\n", - "end_time = '2023-01-05 00:00:00+1030' # In sydney local time\n", + "start_time = \"2023-01-02 00:00:00+1030\" # In sydney local time\n", + "end_time = \"2023-01-05 00:00:00+1030\" # In sydney local time\n", "\n", - "num_ticks = 2*24+1\n", - "save_as = 'Figures/Solar_clouds_active_power.jpeg'\n", - "x_label = 'time'\n", + "num_ticks = 2 * 24 + 1\n", + "save_as = \"Figures/Solar_clouds_active_power.jpeg\"\n", + "x_label = \"time\"\n", "y_labels = [\n", - " 'GHI ($\\mathdefault{W/m^2}$)', \\\n", - " 'Cloud type',\n", - " 'Active power (kW)', ]\n", + " \"GHI ($\\mathdefault{W/m^2}$)\",\n", + " \"Cloud type\",\n", + " \"Active power (kW)\",\n", + "]\n", "plt_config = {\n", - " 'GHI': [0, 0, '-', None, None],\n", - " 'cloud_type': [0, 1, '-', None, None],\n", - " # 'DNI': [0, 0, '-', None, None],\n", - " 'active_power': [1, 0, '-', None, None],\n", - " }\n", - "color_nights=False\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + " # 'DNI': [0, 0, '-', None, None],\n", + " \"active_power\": [1, 0, \"-\", None, None],\n", + "}\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.2f'\n", - "a=my_plot4(start_time, end_time, df8, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='turbo',\n", - " figsize=[14/2.54,1.25], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=7, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['center left', 'center right', 'center left'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, hspace=0,rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', legend_i=0, title_i=0, only1title=1, onlyntime=1)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.2f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df8,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"turbo\",\n", + " figsize=[14 / 2.54, 1.25],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=7,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"center left\", \"center right\", \"center left\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " hspace=0,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=1,\n", + " onlyntime=1,\n", + ")\n", "a.do()" ] } diff --git a/tests/EDP_solar_tripislanding.ipynb b/tests/EDP_solar_tripislanding.ipynb index fd91319..b6d2161 100755 --- a/tests/EDP_solar_tripislanding.ipynb +++ b/tests/EDP_solar_tripislanding.ipynb @@ -10,9 +10,11 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "from epd_process_funcs import *" + "\n", + "sys.path.append(\"../\")\n", + "from epd_process_funcs import *\n", + "\n", + "from visualisation import *" ] }, { @@ -22,12 +24,17 @@ "metadata": {}, "outputs": [], "source": [ - "solar=pd.read_csv('/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_01_02.csv')\n", - "solar['time'] = pd.to_datetime(solar['time'])\n", - "solar['time'] = solar['time'].dt.tz_localize('utc')\n", - "solar['time'] = solar['time'].dt.tz_convert(pytz.FixedOffset(9.5*60))\n", - "solar['postcode'] = solar['postcode'].astype(int)\n", - "solar.rename(columns={'surface_global_irradiance': 'GHI', 'direct_normal_irradiance': 'DNI'}, inplace=True)" + "solar = pd.read_csv(\n", + " \"/home/hossein/CICCADA/BOM_NCI/2023/NCI_processed_grouped_01_02.csv\"\n", + ")\n", + "solar[\"time\"] = pd.to_datetime(solar[\"time\"])\n", + "solar[\"time\"] = solar[\"time\"].dt.tz_localize(\"utc\")\n", + "solar[\"time\"] = solar[\"time\"].dt.tz_convert(pytz.FixedOffset(9.5 * 60))\n", + "solar[\"postcode\"] = solar[\"postcode\"].astype(int)\n", + "solar.rename(\n", + " columns={\"surface_global_irradiance\": \"GHI\", \"direct_normal_irradiance\": \"DNI\"},\n", + " inplace=True,\n", + ")" ] }, { @@ -38,8 +45,10 @@ "outputs": [], "source": [ "edp_path = \"4) Data/EDP SA 2023 Data\"\n", - "edp_files = glob(edp_path+\"/SA_site_edp_2023_S*.csv\")\n", - "df = pd.concat([pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]).reset_index(drop=True) " + "edp_files = glob(edp_path + \"/SA_site_edp_2023_S*.csv\")\n", + "df = pd.concat(\n", + " [pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]\n", + ").reset_index(drop=True)" ] }, { @@ -49,13 +58,26 @@ "metadata": {}, "outputs": [], "source": [ - "meta_data1= pd.read_csv(edp_path+\"/edp_sites_metadata_sa_postcode.csv\")\n", - "meta_data2 = pd.read_csv(edp_path+\"/edp_sites_metadata59239829.csv\")\n", - "meta_data2 = meta_data2[meta_data2['state'] == 'SA']\n", - "meta_data3 = meta_data2.merge(meta_data1[['edp_site_id', 'postcode']], on='edp_site_id', how='left')\n", - "meta_data3['Srated'] = meta_data3['inverter_ac_rating_kw']*meta_data3['inverter_count']\n", - "meta_data3['Srated'] = meta_data3.apply(lambda row: row['inverter_ac_rating_kw'] if pd.isna(row['inverter_count']) else row['Srated'], axis=1)\n", - "meta_data2 = meta_data3.groupby(['edp_site_id', 'postcode']).agg({'Srated':'sum'}).reset_index()" + "meta_data1 = pd.read_csv(edp_path + \"/edp_sites_metadata_sa_postcode.csv\")\n", + "meta_data2 = pd.read_csv(edp_path + \"/edp_sites_metadata59239829.csv\")\n", + "meta_data2 = meta_data2[meta_data2[\"state\"] == \"SA\"]\n", + "meta_data3 = meta_data2.merge(\n", + " meta_data1[[\"edp_site_id\", \"postcode\"]], on=\"edp_site_id\", how=\"left\"\n", + ")\n", + "meta_data3[\"Srated\"] = (\n", + " meta_data3[\"inverter_ac_rating_kw\"] * meta_data3[\"inverter_count\"]\n", + ")\n", + "meta_data3[\"Srated\"] = meta_data3.apply(\n", + " lambda row: (\n", + " row[\"inverter_ac_rating_kw\"]\n", + " if pd.isna(row[\"inverter_count\"])\n", + " else row[\"Srated\"]\n", + " ),\n", + " axis=1,\n", + ")\n", + "meta_data2 = (\n", + " meta_data3.groupby([\"edp_site_id\", \"postcode\"]).agg({\"Srated\": \"sum\"}).reset_index()\n", + ")" ] }, { @@ -65,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "df5 = process_edp(df, meta_data2, 10.5*60)\n" + "df5 = process_edp(df, meta_data2, 10.5 * 60)" ] }, { @@ -76,8 +98,8 @@ "outputs": [], "source": [ "df6 = df5.copy()\n", - "df6['DP'] = df6.groupby(['edp_site_id'])['active_power'].transform(lambda x: x.diff())\n", - "df6['DP_r'] = df6['DP'].clip(upper=0).round()" + "df6[\"DP\"] = df6.groupby([\"edp_site_id\"])[\"active_power\"].transform(lambda x: x.diff())\n", + "df6[\"DP_r\"] = df6[\"DP\"].clip(upper=0).round()" ] }, { @@ -87,7 +109,7 @@ "metadata": {}, "outputs": [], "source": [ - "ii=0" + "ii = 0" ] }, { @@ -109,43 +131,72 @@ } ], "source": [ - "df8 = df6.query(f\"DP_r <= -1 and active_power < .05 and active_power >= -.05 and voltage_avg >= 258\").reset_index(drop=True)\n", + "df8 = df6.query(\n", + " f\"DP_r <= -1 and active_power < .05 and active_power >= -.05 and voltage_avg >= 258\"\n", + ").reset_index(drop=True)\n", "# df8 = df8[df8['time'].dt.hour < 17]\n", - "site_ids = df8['edp_site_id'].unique()[0]\n", - "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")['time'].to_list()[ii]\n", + "site_ids = df8[\"edp_site_id\"].unique()[0]\n", + "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")[\"time\"].to_list()[ii]\n", "# ii= ii + 1\n", "t0 = time_t.replace(hour=0, minute=0, second=0, microsecond=0)\n", "t1 = t0 + pd.Timedelta(days=1)\n", "print(f\"Time of tripping: {time_t}\")\n", "print(f\"site_id: {site_ids}, postcode: {df8['postcode'].unique()[0]}\")\n", "df8 = df6.copy()\n", - "df8 = df8[df8['edp_site_id'] == site_ids].reset_index(drop=True)\n", - "start_time = t0 # In sydney local time\n", - "end_time = t1 # In sydney local time\n", + "df8 = df8[df8[\"edp_site_id\"] == site_ids].reset_index(drop=True)\n", + "start_time = t0 # In sydney local time\n", + "end_time = t1 # In sydney local time\n", "\n", "num_ticks = 73\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['Active power (kW)', \n", - " 'voltage (V)',\n", - " 'GHI', \\\n", - " 'Cloud type']\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\"Active power (kW)\", \"voltage (V)\", \"GHI\", \"Cloud type\"]\n", "plt_config = {\n", - " 'active_power': [0, 0, '-', None, None],\n", - " 'voltage_avg': [0, 1, '-', None, None],\n", - " }\n", + " \"active_power\": [0, 0, \"-\", None, None],\n", + " \"voltage_avg\": [0, 1, \"-\", None, None],\n", + "}\n", "# y_labels = ['Active power (kW)', 'Apparent power (kVA)']\n", "# plt_config = {'active_power': [0, 0, '-', -2.5, 4.5], 'apparent_power': [1, 1, '-', None, None]}\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.2f'\n", - "a=my_plot4(start_time, end_time, df8, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[17/2.54,2.], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['lower left', 'upper right', 'lower left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.5, .5], legend_join='-', title='', legend_i=0, title_i=0, only1title=1, onlyntime=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.2f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df8,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[17 / 2.54, 2.0],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"lower left\", \"upper right\", \"lower left\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.5, 0.5],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=1,\n", + " onlyntime=0,\n", + ")\n", "a.do()" ] }, @@ -168,45 +219,83 @@ } ], "source": [ - "df8 = df6.query(f\"DP_r < -1 and active_power < .1 and active_power >= -.05 and voltage_avg > 250 and time < '2023-02-28 00:00:00+1030'\").reset_index(drop=True)\n", - "site_ids = df8['edp_site_id'].unique()[0]\n", - "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")['time'].to_list()[0]\n", + "df8 = df6.query(\n", + " f\"DP_r < -1 and active_power < .1 and active_power >= -.05 and voltage_avg > 250 and time < '2023-02-28 00:00:00+1030'\"\n", + ").reset_index(drop=True)\n", + "site_ids = df8[\"edp_site_id\"].unique()[0]\n", + "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")[\"time\"].to_list()[0]\n", "t0 = time_t.replace(hour=0, minute=0, second=0, microsecond=0)\n", "t1 = t0 + pd.Timedelta(days=1)\n", "print(f\"Time of tripping: {time_t}\")\n", "print(f\"site_id: {site_ids}, postcode: {df8['postcode'].unique()[0]}\")\n", "df8 = df6.copy()\n", - "df8 = df8[df8['edp_site_id'] == site_ids].reset_index(drop=True)\n", - "df8 = df8.merge(solar, on=['time', 'postcode'], how='left')\n", - "df8 = df8.sort_values(['postcode', 'time']).groupby('postcode').apply(lambda group: group.set_index('time').interpolate(method='time', limit=1).reset_index()).reset_index(drop=True)\n", - "start_time = t0 # In sydney local time\n", - "end_time = t1 # In sydney local time\n", + "df8 = df8[df8[\"edp_site_id\"] == site_ids].reset_index(drop=True)\n", + "df8 = df8.merge(solar, on=[\"time\", \"postcode\"], how=\"left\")\n", + "df8 = (\n", + " df8.sort_values([\"postcode\", \"time\"])\n", + " .groupby(\"postcode\")\n", + " .apply(\n", + " lambda group: (\n", + " group.set_index(\"time\").interpolate(method=\"time\", limit=1).reset_index()\n", + " )\n", + " )\n", + " .reset_index(drop=True)\n", + ")\n", + "start_time = t0 # In sydney local time\n", + "end_time = t1 # In sydney local time\n", "\n", "num_ticks = 73\n", - "save_as = ''\n", - "x_label = 'time'\n", - "y_labels = ['Active power (kW)', \n", - " 'voltage (V)',\n", - " 'GHI', \\\n", - " 'Cloud type']\n", + "save_as = \"\"\n", + "x_label = \"time\"\n", + "y_labels = [\"Active power (kW)\", \"voltage (V)\", \"GHI\", \"Cloud type\"]\n", "plt_config = {\n", - " 'active_power': [1, 0, '-', None, None],\n", - " 'voltage_avg': [1, 1, '-', None, None],\n", - " 'GHI': [0, 0, '-', None, None],\n", - " 'cloud_type': [0, 1, '-', None, None],\n", - " }\n", + " \"active_power\": [1, 0, \"-\", None, None],\n", + " \"voltage_avg\": [1, 1, \"-\", None, None],\n", + " \"GHI\": [0, 0, \"-\", None, None],\n", + " \"cloud_type\": [0, 1, \"-\", None, None],\n", + "}\n", "# y_labels = ['Active power (kW)', 'Apparent power (kVA)']\n", "# plt_config = {'active_power': [0, 0, '-', -2.5, 4.5], 'apparent_power': [1, 1, '-', None, None]}\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.2f'\n", - "a=my_plot4(start_time, end_time, df8, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[17/2.54,2.], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['lower left', 'upper right', 'lower left', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.5, .5], legend_join='-', title='', legend_i=0, title_i=0, only1title=1, onlyntime=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.2f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df8,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[17 / 2.54, 2.0],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"lower left\", \"upper right\", \"lower left\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.5, 0.5],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=1,\n", + " onlyntime=0,\n", + ")\n", "a.do()" ] } diff --git a/tests/EDP_voltvar.ipynb b/tests/EDP_voltvar.ipynb index d8276bd..d44fbdf 100755 --- a/tests/EDP_voltvar.ipynb +++ b/tests/EDP_voltvar.ipynb @@ -10,9 +10,11 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "from epd_process_funcs import *" + "\n", + "sys.path.append(\"../\")\n", + "from epd_process_funcs import *\n", + "\n", + "from visualisation import *" ] }, { @@ -23,15 +25,30 @@ "outputs": [], "source": [ "edp_path = \"4) Data/EDP SA 2023 Data\"\n", - "edp_files = glob(edp_path+\"/SA_site_edp_2023_S*.csv\")\n", - "df = pd.concat([pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]).reset_index(drop=True) \n", - "meta_data1= pd.read_csv(edp_path+\"/edp_sites_metadata_sa_postcode.csv\")\n", - "meta_data2 = pd.read_csv(edp_path+\"/edp_sites_metadata59239829.csv\")\n", - "meta_data2 = meta_data2[meta_data2['state'] == 'SA']\n", - "meta_data3 = meta_data2.merge(meta_data1[['edp_site_id', 'postcode']], on='edp_site_id', how='left')\n", - "meta_data3['Srated'] = meta_data3['inverter_ac_rating_kw']*meta_data3['inverter_count']\n", - "meta_data3['Srated'] = meta_data3.apply(lambda row: row['inverter_ac_rating_kw'] if pd.isna(row['inverter_count']) else row['Srated'], axis=1)\n", - "meta_data2 = meta_data3.groupby(['edp_site_id', 'postcode']).agg({'Srated':'sum'}).reset_index()" + "edp_files = glob(edp_path + \"/SA_site_edp_2023_S*.csv\")\n", + "df = pd.concat(\n", + " [pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]\n", + ").reset_index(drop=True)\n", + "meta_data1 = pd.read_csv(edp_path + \"/edp_sites_metadata_sa_postcode.csv\")\n", + "meta_data2 = pd.read_csv(edp_path + \"/edp_sites_metadata59239829.csv\")\n", + "meta_data2 = meta_data2[meta_data2[\"state\"] == \"SA\"]\n", + "meta_data3 = meta_data2.merge(\n", + " meta_data1[[\"edp_site_id\", \"postcode\"]], on=\"edp_site_id\", how=\"left\"\n", + ")\n", + "meta_data3[\"Srated\"] = (\n", + " meta_data3[\"inverter_ac_rating_kw\"] * meta_data3[\"inverter_count\"]\n", + ")\n", + "meta_data3[\"Srated\"] = meta_data3.apply(\n", + " lambda row: (\n", + " row[\"inverter_ac_rating_kw\"]\n", + " if pd.isna(row[\"inverter_count\"])\n", + " else row[\"Srated\"]\n", + " ),\n", + " axis=1,\n", + ")\n", + "meta_data2 = (\n", + " meta_data3.groupby([\"edp_site_id\", \"postcode\"]).agg({\"Srated\": \"sum\"}).reset_index()\n", + ")" ] }, { @@ -41,7 +58,7 @@ "metadata": {}, "outputs": [], "source": [ - "df5 = process_edp(df, meta_data2, 10.5*60)\n" + "df5 = process_edp(df, meta_data2, 10.5 * 60)" ] }, { @@ -51,7 +68,7 @@ "metadata": {}, "outputs": [], "source": [ - "ii=0" + "ii = 0" ] }, { @@ -82,38 +99,78 @@ } ], "source": [ - "df8 = df5.query(f\"Q_noncomp > .2 and voltage_avg > 250 and reactive_power < -.5\").reset_index(drop=True)\n", - "site_ids = df8['edp_site_id'].unique()[0]\n", - "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")['time'].to_list()[ii]\n", - "ii+=1\n", + "df8 = df5.query(\n", + " f\"Q_noncomp > .2 and voltage_avg > 250 and reactive_power < -.5\"\n", + ").reset_index(drop=True)\n", + "site_ids = df8[\"edp_site_id\"].unique()[0]\n", + "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")[\"time\"].to_list()[ii]\n", + "ii += 1\n", "t0 = time_t.replace(hour=0, minute=0, second=0, microsecond=0)\n", "t1 = t0 + pd.Timedelta(days=1)\n", "print(f\"Volt-var non-compliance\")\n", "print(f\"Time: {time_t}\")\n", "print(f\"site_id: {site_ids}, postcode: {df8['postcode'].unique()[0]}\")\n", "df8 = df5.copy()\n", - "df8 = df8[df8['edp_site_id'] == site_ids].reset_index(drop=True)\n", - "start_time = t0 # In sydney local time\n", - "end_time = t1 # In sydney local time\n", + "df8 = df8[df8[\"edp_site_id\"] == site_ids].reset_index(drop=True)\n", + "start_time = t0 # In sydney local time\n", + "end_time = t1 # In sydney local time\n", "\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/EDP_voltvar.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['Reactive power (kvar)', 'Reactive power (kvar)','Reactive power (kvar)', 'voltage (V)']\n", - "plt_config = {'Q_voltvar_max': [0, 0, '-.', None, None], 'Q_voltvar_min': [0, 0, '-.', None, None], 'reactive_power': [0, 0, '-', None, None],\n", - "'voltage_avg': [0, 1, '-', None, None]}\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/EDP_voltvar.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\n", + " \"Reactive power (kvar)\",\n", + " \"Reactive power (kvar)\",\n", + " \"Reactive power (kvar)\",\n", + " \"voltage (V)\",\n", + "]\n", + "plt_config = {\n", + " \"Q_voltvar_max\": [0, 0, \"-.\", None, None],\n", + " \"Q_voltvar_min\": [0, 0, \"-.\", None, None],\n", + " \"reactive_power\": [0, 0, \"-\", None, None],\n", + " \"voltage_avg\": [0, 1, \"-\", None, None],\n", + "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df8, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[14/2.54,2], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['lower left', 'center right', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df8,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[14 / 2.54, 2],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"lower left\", \"center right\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -135,7 +192,7 @@ } ], "source": [ - "5*.05" + "5 * 0.05" ] }, { diff --git a/tests/EDP_voltwatt.ipynb b/tests/EDP_voltwatt.ipynb index d918820..c2fcd9f 100755 --- a/tests/EDP_voltwatt.ipynb +++ b/tests/EDP_voltwatt.ipynb @@ -10,9 +10,11 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n", - "from epd_process_funcs import *" + "\n", + "sys.path.append(\"../\")\n", + "from epd_process_funcs import *\n", + "\n", + "from visualisation import *" ] }, { @@ -23,15 +25,30 @@ "outputs": [], "source": [ "edp_path = \"4) Data/EDP SA 2023 Data\"\n", - "edp_files = glob(edp_path+\"/SA_site_edp_2023_S*.csv\")\n", - "df = pd.concat([pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]).reset_index(drop=True) \n", - "meta_data1= pd.read_csv(edp_path+\"/edp_sites_metadata_sa_postcode.csv\")\n", - "meta_data2 = pd.read_csv(edp_path+\"/edp_sites_metadata59239829.csv\")\n", - "meta_data2 = meta_data2[meta_data2['state'] == 'SA']\n", - "meta_data3 = meta_data2.merge(meta_data1[['edp_site_id', 'postcode']], on='edp_site_id', how='left')\n", - "meta_data3['Srated'] = meta_data3['inverter_ac_rating_kw']*meta_data3['inverter_count']\n", - "meta_data3['Srated'] = meta_data3.apply(lambda row: row['inverter_ac_rating_kw'] if pd.isna(row['inverter_count']) else row['Srated'], axis=1)\n", - "meta_data2 = meta_data3.groupby(['edp_site_id', 'postcode']).agg({'Srated':'sum'}).reset_index()" + "edp_files = glob(edp_path + \"/SA_site_edp_2023_S*.csv\")\n", + "df = pd.concat(\n", + " [pd.read_csv(i) for i in edp_files if os.path.getsize(i) > 0]\n", + ").reset_index(drop=True)\n", + "meta_data1 = pd.read_csv(edp_path + \"/edp_sites_metadata_sa_postcode.csv\")\n", + "meta_data2 = pd.read_csv(edp_path + \"/edp_sites_metadata59239829.csv\")\n", + "meta_data2 = meta_data2[meta_data2[\"state\"] == \"SA\"]\n", + "meta_data3 = meta_data2.merge(\n", + " meta_data1[[\"edp_site_id\", \"postcode\"]], on=\"edp_site_id\", how=\"left\"\n", + ")\n", + "meta_data3[\"Srated\"] = (\n", + " meta_data3[\"inverter_ac_rating_kw\"] * meta_data3[\"inverter_count\"]\n", + ")\n", + "meta_data3[\"Srated\"] = meta_data3.apply(\n", + " lambda row: (\n", + " row[\"inverter_ac_rating_kw\"]\n", + " if pd.isna(row[\"inverter_count\"])\n", + " else row[\"Srated\"]\n", + " ),\n", + " axis=1,\n", + ")\n", + "meta_data2 = (\n", + " meta_data3.groupby([\"edp_site_id\", \"postcode\"]).agg({\"Srated\": \"sum\"}).reset_index()\n", + ")" ] }, { @@ -41,7 +58,7 @@ "metadata": {}, "outputs": [], "source": [ - "df5 = process_edp(df, meta_data2, 10.5*60)\n" + "df5 = process_edp(df, meta_data2, 10.5 * 60)" ] }, { @@ -51,7 +68,7 @@ "metadata": {}, "outputs": [], "source": [ - "ii=11" + "ii = 11" ] }, { @@ -83,8 +100,8 @@ ], "source": [ "df8 = df5.query(f\"P_noncomp > 0 and voltage_avg > 250\").reset_index(drop=True)\n", - "site_ids = df8['edp_site_id'].unique()[0]\n", - "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")['time'].to_list()[ii]\n", + "site_ids = df8[\"edp_site_id\"].unique()[0]\n", + "time_t = df8.query(f\"edp_site_id == '{site_ids}'\")[\"time\"].to_list()[ii]\n", "# ii+=1\n", "t0 = time_t.replace(hour=0, minute=0, second=0, microsecond=0)\n", "t1 = t0 + pd.Timedelta(days=1)\n", @@ -92,28 +109,60 @@ "print(f\"Time: {time_t}\")\n", "print(f\"site_id: {site_ids}, postcode: {df8['postcode'].unique()[0]}\")\n", "df8 = df5.copy()\n", - "df8 = df8[df8['edp_site_id'] == site_ids].reset_index(drop=True)\n", - "start_time = t0 # In sydney local time\n", - "end_time = t1 # In sydney local time\n", + "df8 = df8[df8[\"edp_site_id\"] == site_ids].reset_index(drop=True)\n", + "start_time = t0 # In sydney local time\n", + "end_time = t1 # In sydney local time\n", "\n", "\n", - "num_ticks = 24*2+1\n", - "save_as = 'Figures/EDP_voltwatt.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['Active power (kW)', 'Active power (kW)', 'voltage (V)']\n", - "plt_config = {'P_threshold': [0, 0, '-.', None, None], 'active_power': [0, 0, '-', None, None],\n", - "'voltage_avg': [0, 1, '-', None, None]}\n", + "num_ticks = 24 * 2 + 1\n", + "save_as = \"Figures/EDP_voltwatt.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\"Active power (kW)\", \"Active power (kW)\", \"voltage (V)\"]\n", + "plt_config = {\n", + " \"P_threshold\": [0, 0, \"-.\", None, None],\n", + " \"active_power\": [0, 0, \"-\", None, None],\n", + " \"voltage_avg\": [0, 1, \"-\", None, None],\n", + "}\n", "\n", - "color_nights=False\n", + "color_nights = False\n", "# color_by = 'group'\n", - "color_by = 'attribute'\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df8, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='edp_site_id', time_attr='time', color_nights=color_nights,cmap='viridis',\n", - " figsize=[14/2.54,2], same_scale=1, fontsize=7, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " num_ticks=num_ticks, num_yticks=10, dpi=300, x_format= '%H:%M', \n", - " legend_loc=['upper left', 'upper right', 'upper right'], x_label=x_label, y_labels=y_labels, color_by=color_by,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 60, step=0, gridwidth=[0.2, .2], legend_join='-', title='', legend_i=0, title_i=0, only1title=0)\n", + "color_by = \"attribute\"\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df8,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"edp_site_id\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"viridis\",\n", + " figsize=[14 / 2.54, 2],\n", + " same_scale=1,\n", + " fontsize=7,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " num_ticks=num_ticks,\n", + " num_yticks=10,\n", + " dpi=300,\n", + " x_format=\"%H:%M\",\n", + " legend_loc=[\"upper left\", \"upper right\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " color_by=color_by,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=60,\n", + " step=0,\n", + " gridwidth=[0.2, 0.2],\n", + " legend_join=\"-\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, diff --git a/tests/clear_sky_day_analysis.ipynb b/tests/clear_sky_day_analysis.ipynb index 87c9fad..edfdc96 100755 --- a/tests/clear_sky_day_analysis.ipynb +++ b/tests/clear_sky_day_analysis.ipynb @@ -7,8 +7,9 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../') \n", - "from visualisation import *\n" + "\n", + "sys.path.append(\"../\")\n", + "from visualisation import *" ] }, { @@ -29,17 +30,17 @@ } ], "source": [ - "df=pd.read_csv('/mnt/d/bom_nci/2023/NCI_processed_Adelaide_grouped.csv')\n", + "df = pd.read_csv(\"/mnt/d/bom_nci/2023/NCI_processed_Adelaide_grouped.csv\")\n", "# df = df.query(f\"postcode == 5007\")\n", "\n", - "df['time'] = pd.to_datetime(df['time'])\n", - "df['time'] = df['time'].dt.tz_localize('utc')\n", - "df['time'] = df['time'].dt.tz_convert(pytz.FixedOffset(10*60))\n", + "df[\"time\"] = pd.to_datetime(df[\"time\"])\n", + "df[\"time\"] = df[\"time\"].dt.tz_localize(\"utc\")\n", + "df[\"time\"] = df[\"time\"].dt.tz_convert(pytz.FixedOffset(10 * 60))\n", "# df.set_index('time', inplace=True)\n", "# min_time = df.index.min()\n", "# max_time = df.index.max()\n", - "# full_time_range = pd.date_range(start=min_time, \n", - "# end=max_time, \n", + "# full_time_range = pd.date_range(start=min_time,\n", + "# end=max_time,\n", "# freq='10min')\n", "\n", "# # Reindex and fill missing entries with zero\n", @@ -47,9 +48,9 @@ "# df = df.interpolate(method='linear', limit=1, limit_direction='both')\n", "# df = df.fillna(0)\n", "# df['time'] = df.pop('index')\n", - "df = df.sort_values('time')\n", - "df['groupid'] = ''\n", - "df['time'].min(), df['time'].max()" + "df = df.sort_values(\"time\")\n", + "df[\"groupid\"] = \"\"\n", + "df[\"time\"].min(), df[\"time\"].max()" ] }, { @@ -71,7 +72,7 @@ } ], "source": [ - "df['time'].diff().unique()" + "df[\"time\"].diff().unique()" ] }, { @@ -91,7 +92,7 @@ } ], "source": [ - "df['quality_mask'].unique()" + "df[\"quality_mask\"].unique()" ] }, { @@ -118,25 +119,59 @@ } ], "source": [ - "start_time = '2023-01-01 00:00:00+10:00' # In sydney local time\n", - "end_time = '2023-01-07 00:00:00+10:00' # In sydney local time\n", + "start_time = \"2023-01-01 00:00:00+10:00\" # In sydney local time\n", + "end_time = \"2023-01-07 00:00:00+10:00\" # In sydney local time\n", "\n", "num_ticks = 25\n", - "save_as = 'Figures/GHI_Adelaide_January_2023.jpeg'\n", - "x_label = 'time'\n", - "y_labels = ['surface_global_irradiance', 'cloud_type']\n", - "plt_config = {'surface_global_irradiance': [0, 0, '-', 0, 1200],\n", - "'cloud_type': [0, 1, '-', -0, 8]}\n", + "save_as = \"Figures/GHI_Adelaide_January_2023.jpeg\"\n", + "x_label = \"time\"\n", + "y_labels = [\"surface_global_irradiance\", \"cloud_type\"]\n", + "plt_config = {\n", + " \"surface_global_irradiance\": [0, 0, \"-\", 0, 1200],\n", + " \"cloud_type\": [0, 1, \"-\", -0, 8],\n", + "}\n", "special_legend = []\n", - "color_nights=False\n", + "color_nights = False\n", "kW2MW_attr = []\n", - "ax_digit = '1.1f'\n", - "a=my_plot4(start_time, end_time, df, plt_config=plt_config, ax_digit= ax_digit,\n", - " group_attr='groupid', time_attr='time', color_nights=color_nights,cmap='RdYlBu',\n", - " figsize=[17/2.54,2.], same_scale=False, fontsize=9, fontname='Times New Roman', plot_total=False, plot_total_func=['sum', [lambda x: max(x), 'max']], \n", - " E2P_attr=None, num_ticks=num_ticks, num_yticks=12, dpi=300, special_legend=special_legend, x_format= '%H:%M', MW=False, \n", - " legend_loc=['upper left', 'upper right'], x_label=x_label, y_labels=y_labels, kW2MW_attr=kW2MW_attr,\n", - "plot_period=np.timedelta64(1, 'D'), save_as=save_as, rotation = 25, step=True, gridwidth=[0.5, .5], legend_join='', title='', legend_i=0, title_i=0, only1title=0)\n", + "ax_digit = \"1.1f\"\n", + "a = my_plot4(\n", + " start_time,\n", + " end_time,\n", + " df,\n", + " plt_config=plt_config,\n", + " ax_digit=ax_digit,\n", + " group_attr=\"groupid\",\n", + " time_attr=\"time\",\n", + " color_nights=color_nights,\n", + " cmap=\"RdYlBu\",\n", + " figsize=[17 / 2.54, 2.0],\n", + " same_scale=False,\n", + " fontsize=9,\n", + " fontname=\"Times New Roman\",\n", + " plot_total=False,\n", + " plot_total_func=[\"sum\", [lambda x: max(x), \"max\"]],\n", + " E2P_attr=None,\n", + " num_ticks=num_ticks,\n", + " num_yticks=12,\n", + " dpi=300,\n", + " special_legend=special_legend,\n", + " x_format=\"%H:%M\",\n", + " MW=False,\n", + " legend_loc=[\"upper left\", \"upper right\"],\n", + " x_label=x_label,\n", + " y_labels=y_labels,\n", + " kW2MW_attr=kW2MW_attr,\n", + " plot_period=np.timedelta64(1, \"D\"),\n", + " save_as=save_as,\n", + " rotation=25,\n", + " step=True,\n", + " gridwidth=[0.5, 0.5],\n", + " legend_join=\"\",\n", + " title=\"\",\n", + " legend_i=0,\n", + " title_i=0,\n", + " only1title=0,\n", + ")\n", "a.do()" ] }, @@ -155,10 +190,16 @@ "metadata": {}, "outputs": [], "source": [ - "df_cloud_type = df.groupby(['time', 'postcode']).agg({'cloud_type': 'sum'}).reset_index()\n", - "df_cloud_type = df_cloud_type.groupby('postcode').agg({'cloud_type': lambda x:x[x==0].count()}).reset_index()\n", - "df_cloud_type = df_cloud_type.sort_values('cloud_type', ascending=False)\n", - "df_cloud_type.columns = ['postcode', 'num_clear_days']" + "df_cloud_type = (\n", + " df.groupby([\"time\", \"postcode\"]).agg({\"cloud_type\": \"sum\"}).reset_index()\n", + ")\n", + "df_cloud_type = (\n", + " df_cloud_type.groupby(\"postcode\")\n", + " .agg({\"cloud_type\": lambda x: x[x == 0].count()})\n", + " .reset_index()\n", + ")\n", + "df_cloud_type = df_cloud_type.sort_values(\"cloud_type\", ascending=False)\n", + "df_cloud_type.columns = [\"postcode\", \"num_clear_days\"]" ] }, { @@ -178,7 +219,7 @@ } ], "source": [ - "9457/6/8" + "9457 / 6 / 8" ] }, { diff --git a/tests/epd_process_funcs.py b/tests/epd_process_funcs.py index 0d81853..0e6601f 100755 --- a/tests/epd_process_funcs.py +++ b/tests/epd_process_funcs.py @@ -1,56 +1,90 @@ import pandas as pd import pytz + + def get_max_P(V, Srated=1, v1=253, v2=260): if V < v1: return Srated elif V > v2: - return .2 * Srated + return 0.2 * Srated else: - m = (Srated - .2*Srated) / (v1 - v2) - P = m * (V - v2) + .2*Srated + m = (Srated - 0.2 * Srated) / (v1 - v2) + P = m * (V - v2) + 0.2 * Srated return P - -def get_voltvar_Q(V, P, Srated=1, v1=207, v2=220, v3=240, v4=258, Q1=.44, Q4=.60): + + +def get_voltvar_Q(V, P, Srated=1, v1=207, v2=220, v3=240, v4=258, Q1=0.44, Q4=0.60): if V <= v1: - Q = Q1* Srated + Q = Q1 * Srated elif v1 <= V < v2: - m = (Q1* Srated - 0) / (v1 - v2) + m = (Q1 * Srated - 0) / (v1 - v2) Q = m * (V - v2) elif v2 <= V <= v3: Q = 0 elif v3 < V < v4: - m = (0 - Q4* Srated) / (v3 - v4) - Q = -m * (V - v4) - Q4* Srated + m = (0 - Q4 * Srated) / (v3 - v4) + Q = -m * (V - v4) - Q4 * Srated else: # V >= v4 - Q = - Q4* Srated + Q = -Q4 * Srated return Q -def process_edp(df, meta_data, offset=9.5*60): + +def process_edp(df, meta_data, offset=9.5 * 60): df5 = df.query("circuit_label == 'pv_site_net'") - df5.loc[df5['voltage_avg'] > 300, 'voltage_avg'] = None + df5.loc[df5["voltage_avg"] > 300, "voltage_avg"] = None # df5.loc[df5['voltage_avg'] > 300, 'voltage_avg'] = df5['voltage_avg']/1000 - df5 = df5.merge(meta_data, on='edp_site_id', how='left') - df5 = df5[['edp_site_id', 'datetime', 'real_energy', - 'reactive_energy', 'current_avg', 'voltage_avg', - 'postcode', 'Srated']] - df5['datetime'] = pd.to_datetime(df5['datetime']) - df5['time'] = df5.pop('datetime') - df5['time'] = df5['time'].dt.tz_localize(pytz.FixedOffset(offset)) - df5 = df5.groupby(['time', 'edp_site_id', 'postcode', 'Srated']).agg({'real_energy': 'sum', 'reactive_energy': 'sum', - 'current_avg': 'sum', 'voltage_avg': 'mean'}).reset_index() - df5['active_power'] = df5['real_energy']*12/1000 - df5['reactive_power'] = df5['reactive_energy']*12/1000 - df5['apparent_power'] = df5['active_power']**2 + df5['reactive_power']**2 - df5['apparent_power'] = df5['apparent_power']**.5 - df5['P_threshold'] = df5.apply(lambda row: get_max_P(row['voltage_avg'], Srated=row['Srated'], v1=253, v2=260), axis=1) - df5['P_noncomp'] = df5['active_power'] - df5['P_threshold'] - df5['P_noncomp'] = df5['P_noncomp'].clip(lower=0) - df5['Q_voltvar'] =df5.apply(lambda row: get_voltvar_Q(row['voltage_avg'], Srated=row['Srated']), axis=1) - df5['Q_voltvar_max'] = df5['Q_voltvar'] + df5['Srated'] * 0.05 - df5['Q_voltvar_min'] = df5['Q_voltvar'] - df5['Srated'] * 0.05 - df5['Q_noncomp'] = ((df5['reactive_power'] - df5['Q_voltvar']).abs() - df5['Srated'] * 0.05).abs() - wrong_meta_data_based_on_maxP = df5.query(f"active_power > Srated")['edp_site_id'].unique() - df5['wrong_on_maxP'] = False - df5.loc[df5['edp_site_id'].isin(wrong_meta_data_based_on_maxP) == True, 'wrong_on_maxP'] = True - df5 = df5.sort_values(by=['edp_site_id', 'time']).reset_index(drop=True) - return df5 \ No newline at end of file + df5 = df5.merge(meta_data, on="edp_site_id", how="left") + df5 = df5[ + [ + "edp_site_id", + "datetime", + "real_energy", + "reactive_energy", + "current_avg", + "voltage_avg", + "postcode", + "Srated", + ] + ] + df5["datetime"] = pd.to_datetime(df5["datetime"]) + df5["time"] = df5.pop("datetime") + df5["time"] = df5["time"].dt.tz_localize(pytz.FixedOffset(offset)) + df5 = ( + df5.groupby(["time", "edp_site_id", "postcode", "Srated"]) + .agg( + { + "real_energy": "sum", + "reactive_energy": "sum", + "current_avg": "sum", + "voltage_avg": "mean", + } + ) + .reset_index() + ) + df5["active_power"] = df5["real_energy"] * 12 / 1000 + df5["reactive_power"] = df5["reactive_energy"] * 12 / 1000 + df5["apparent_power"] = df5["active_power"] ** 2 + df5["reactive_power"] ** 2 + df5["apparent_power"] = df5["apparent_power"] ** 0.5 + df5["P_threshold"] = df5.apply( + lambda row: get_max_P(row["voltage_avg"], Srated=row["Srated"], v1=253, v2=260), + axis=1, + ) + df5["P_noncomp"] = df5["active_power"] - df5["P_threshold"] + df5["P_noncomp"] = df5["P_noncomp"].clip(lower=0) + df5["Q_voltvar"] = df5.apply( + lambda row: get_voltvar_Q(row["voltage_avg"], Srated=row["Srated"]), axis=1 + ) + df5["Q_voltvar_max"] = df5["Q_voltvar"] + df5["Srated"] * 0.05 + df5["Q_voltvar_min"] = df5["Q_voltvar"] - df5["Srated"] * 0.05 + df5["Q_noncomp"] = ( + (df5["reactive_power"] - df5["Q_voltvar"]).abs() - df5["Srated"] * 0.05 + ).abs() + wrong_meta_data_based_on_maxP = df5.query(f"active_power > Srated")[ + "edp_site_id" + ].unique() + df5["wrong_on_maxP"] = False + df5.loc[ + df5["edp_site_id"].isin(wrong_meta_data_based_on_maxP) == True, "wrong_on_maxP" + ] = True + df5 = df5.sort_values(by=["edp_site_id", "time"]).reset_index(drop=True) + return df5 diff --git a/tests/test_clear_sky_days.py b/tests/test_clear_sky_days.py index 720fd15..17e3ca4 100755 --- a/tests/test_clear_sky_days.py +++ b/tests/test_clear_sky_days.py @@ -1,7 +1,6 @@ from pathlib import Path import pandas as pd - from ciccada.clear_sky_days import detect_clear_sky_day diff --git a/visualisation.py b/visualisation.py index 772b897..0a8c990 100755 --- a/visualisation.py +++ b/visualisation.py @@ -1,46 +1,86 @@ -import subprocess -import numpy as np import datetime -import matplotlib.pyplot as plt -import pandas as pd import json import os -from IPython.display import display, HTML -import json -from glob import glob -from colorama import Fore +import random import subprocess import sys import warnings +from collections import defaultdict +from datetime import timedelta +from glob import glob +from itertools import cycle from time import sleep -import random -import pytz + +import matplotlib import matplotlib.dates as mdates -from datetime import timedelta +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import pytz +from colorama import Fore +from IPython.display import HTML, display +from matplotlib.colors import LinearSegmentedColormap from matplotlib.dates import date2num -import matplotlib from matplotlib.gridspec import GridSpec from matplotlib.ticker import FuncFormatter -from matplotlib.colors import LinearSegmentedColormap -from itertools import cycle -from collections import defaultdict -warnings.filterwarnings('ignore') +warnings.filterwarnings("ignore") -class my_plot4(): - def __init__(self,t0, t1, df, plt_config, ax_digit, x_label, y_labels, x_format='%a-%m-%d %H:%M', - group_attr='groupid', time_attr='time', E2P_attr = None, plot_total=False, - plot_total_func=['sum', 'mean'], plot_period=False, cmap='plasma', - legend_loc='center right', num_ticks=12, num_yticks=12, save_as='', figsize=[12,1], fontsize=10, color_nights=True, - fontname = 'times new roman', kW2MW_attr = [], color_by='attribute', gridwidth=[1, 1], - dpi=150, hspace= 0.25, special_legend=[], special_group=[], title_i=0, title_j=None, MW=False, same_scale=True, - step=False, show_legends=True, rotation=45, title='Date', legend_join='-', only1title=False, onlyntime=False, lim_legend=None, - legend_i=None, legend_j=None, bbox_inches='tight', show=True): +class my_plot4: + def __init__( + self, + t0, + t1, + df, + plt_config, + ax_digit, + x_label, + y_labels, + x_format="%a-%m-%d %H:%M", + group_attr="groupid", + time_attr="time", + E2P_attr=None, + plot_total=False, + plot_total_func=["sum", "mean"], + plot_period=False, + cmap="plasma", + legend_loc="center right", + num_ticks=12, + num_yticks=12, + save_as="", + figsize=[12, 1], + fontsize=10, + color_nights=True, + fontname="times new roman", + kW2MW_attr=[], + color_by="attribute", + gridwidth=[1, 1], + dpi=150, + hspace=0.25, + special_legend=[], + special_group=[], + title_i=0, + title_j=None, + MW=False, + same_scale=True, + step=False, + show_legends=True, + rotation=45, + title="Date", + legend_join="-", + only1title=False, + onlyntime=False, + lim_legend=None, + legend_i=None, + legend_j=None, + bbox_inches="tight", + show=True, + ): self.t0_lc_str = t0 self.t1_lc_str = t1 - self.t0_lc_dt = pd.to_datetime(t0, format='%Y-%m-%d %H:%M:%S%f%z') - self.t1_lc_dt = pd.to_datetime(t1, format='%Y-%m-%d %H:%M:%S%f%z') + self.t0_lc_dt = pd.to_datetime(t0, format="%Y-%m-%d %H:%M:%S%f%z") + self.t1_lc_dt = pd.to_datetime(t1, format="%Y-%m-%d %H:%M:%S%f%z") self.df = df.copy() self.only1title = only1title self.onlyntime = onlyntime @@ -50,8 +90,13 @@ def __init__(self,t0, t1, df, plt_config, ax_digit, x_label, y_labels, x_format= self.time_attr = time_attr self.x_label = x_label self.y_labels = y_labels - self.df = self.df.query(f'{self.time_attr} >= \'{self.t0_lc_str}\' and {self.time_attr} < \'{self.t1_lc_str}\'').sort_values( - by=self.time_attr).reset_index(drop=True) + self.df = ( + self.df.query( + f"{self.time_attr} >= '{self.t0_lc_str}' and {self.time_attr} < '{self.t1_lc_str}'" + ) + .sort_values(by=self.time_attr) + .reset_index(drop=True) + ) self.plt_period = plot_period self.plt_attr = list(plt_config.keys()) self.group_attr = group_attr @@ -63,20 +108,20 @@ def __init__(self,t0, t1, df, plt_config, ax_digit, x_label, y_labels, x_format= self.legend_j = legend_j self.bbox_inches = bbox_inches self.spcl_lgnd = special_legend - self.plt_loc = np.asarray(np.asarray(list(plt_config.values()))[:, :2], dtype='float32') + self.plt_loc = np.asarray( + np.asarray(list(plt_config.values()))[:, :2], dtype="float32" + ) self.n_ticks = num_ticks self.n_yticks = num_yticks self.save_path = save_as self.spcl_g = special_group self.hspace = hspace - self.title=title + self.title = title self.legend_join = legend_join - self.font = {'family': fontname, - 'weight': 'normal', - 'size' : fontsize} + self.font = {"family": fontname, "weight": "normal", "size": fontsize} self.dpi = dpi self.gridwidth = gridwidth - self.color_by=color_by + self.color_by = color_by self.title_i = title_i self.title_j = title_j self.figsize = figsize @@ -90,18 +135,31 @@ def __init__(self,t0, t1, df, plt_config, ax_digit, x_label, y_labels, x_format= self.g_titles = self.get_g_titles() cmap = matplotlib.colormaps[cmap] # cmap = remove_yellow(cmap) - if self.color_by == 'group': + if self.color_by == "group": self.colors = cmap(np.linspace(0, 1, len(self.g_titles))) else: - self.colors = cmap(np.linspace(0, 1, len(self.g_titles)*len(self.plt_attr))) + self.colors = cmap( + np.linspace(0, 1, len(self.g_titles) * len(self.plt_attr)) + ) # self.colors = cmap(np.linspace(0, 1, len(self.g_titles)*len(self.plt_attr))) if self.plt_t: - df_tot = self.df.groupby(self.time_attr).agg({self.plt_attr[i]: self.plt_t_func[i] for i in range(len(self.plt_attr))}).reset_index() - df_tot[self.group_attr] = 'All' - self.df = pd.concat([self.df, df_tot], axis=0, ignore_index=True).sort_values(by=self.time_attr) + df_tot = ( + self.df.groupby(self.time_attr) + .agg( + { + self.plt_attr[i]: self.plt_t_func[i] + for i in range(len(self.plt_attr)) + } + ) + .reset_index() + ) + df_tot[self.group_attr] = "All" + self.df = pd.concat( + [self.df, df_tot], axis=0, ignore_index=True + ).sort_values(by=self.time_attr) self.k_total = self.get_k_period() y_min = self.plt_config[self.plt_attr[0]][3] - if y_min is None: + if y_min is None: self.y_max = self.get_y_max(same_scale) self.y_min = self.get_y_min(same_scale) if self.n_yticks is not None: @@ -109,42 +167,53 @@ def __init__(self,t0, t1, df, plt_config, ax_digit, x_label, y_labels, x_format= else: self.yticks = None c = [self.time_attr] + self.plt_attr - self.df_grouped = self.df.groupby(['num_periods', self.group_attr]).agg({i: lambda x:x.to_list() for i in c}).sort_values(by=['num_periods', self.group_attr], ascending=False).reset_index() + self.df_grouped = ( + self.df.groupby(["num_periods", self.group_attr]) + .agg({i: lambda x: x.to_list() for i in c}) + .sort_values(by=["num_periods", self.group_attr], ascending=False) + .reset_index() + ) self.fig, self.spec, self.axs, self.twin_axs = self.my_fig() - self.step=step + self.step = step self.ax_digit = ax_digit - + def get_g_titles(self): group_title = self.df[self.group_attr].unique().tolist() group_title = sorted(group_title) - if len(self.spcl_g)> 0: + if len(self.spcl_g) > 0: group_title = self.spcl_g elif self.plt_t: - group_title = ['All'] + group_title + group_title = ["All"] + group_title return group_title - + def get_k_period(self): - if not (isinstance(self.plt_period, np.timedelta64) or self.plt_period is False): + if not ( + isinstance(self.plt_period, np.timedelta64) or self.plt_period is False + ): raise ValueError("The plot_period is not of type np.timedelta64 or False") if self.plt_period is False: k_total = 1 - self.plt_period = (self.t1_lc_dt - self.t0_lc_dt) - self.df['num_periods'] = 1 + self.plt_period = self.t1_lc_dt - self.t0_lc_dt + self.df["num_periods"] = 1 else: t0 = self.df[self.time_attr].max() + self.plt_period - self.df['num_periods'] = (t0 - self.df[self.time_attr])/self.plt_period - self.df['num_periods'] = self.df['num_periods'].astype(int) - k_total = self.df['num_periods'].unique().shape[0] + self.df["num_periods"] = (t0 - self.df[self.time_attr]) / self.plt_period + self.df["num_periods"] = self.df["num_periods"].astype(int) + k_total = self.df["num_periods"].unique().shape[0] # k_total = (self.t1_lc_dt - self.t0_lc_dt)/self.plt_period - return np.ceil(k_total).astype(int)*np.unique(self.plt_loc[:, 0]).shape[0] - + return np.ceil(k_total).astype(int) * np.unique(self.plt_loc[:, 0]).shape[0] + def get_y_max(self, same_scale): y_max = None if same_scale: if self.plt_t: # y_max = self.df.groupby(self.time_attr)[self.plt_attr].agg({ # self.plt_attr[i]: self.plt_t_func[i] for i in range(len(self.plt_attr))}).max().tolist() - y_max = self.df[self.df[self.group_attr]=='All'][self.plt_attr].max().tolist() + y_max = ( + self.df[self.df[self.group_attr] == "All"][self.plt_attr] + .max() + .tolist() + ) else: y_max = self.df[self.plt_attr].max().tolist() y_max = [i for i in y_max] @@ -157,15 +226,19 @@ def get_y_max(self, same_scale): ymax_dict[loc] = max(ymax_dict[loc], val) # Step 2: Reassign y_max so each entry reflects the max at its location - y_max = [ymax_dict[loc]+.1 for loc in y_loc] - + y_max = [ymax_dict[loc] + 0.1 for loc in y_loc] + return y_max def get_y_min(self, same_scale): y_min = None if same_scale: if self.plt_t: - y_min = self.df[self.df[self.group_attr]=='All'][self.plt_attr].min().tolist() + y_min = ( + self.df[self.df[self.group_attr] == "All"][self.plt_attr] + .min() + .tolist() + ) else: y_min = self.df[self.plt_attr].min().tolist() y_min = [i for i in y_min] @@ -178,60 +251,77 @@ def get_y_min(self, same_scale): y_min_dict[loc] = min(y_min_dict[loc], val) # Step 2: Reassign y_max so each entry reflects the max at its location - y_min = [y_min_dict[loc]-.1 for loc in y_loc] + y_min = [y_min_dict[loc] - 0.1 for loc in y_loc] return y_min - + def get_yticks(self, same_scale): if same_scale: - yticks = [np.linspace(self.y_min[i], self.y_max[i], self.n_yticks) for i in range(len(self.y_max))] + yticks = [ + np.linspace(self.y_min[i], self.y_max[i], self.n_yticks) + for i in range(len(self.y_max)) + ] else: yticks = None return yticks - + def check_timedelta_E2P(self, MW): - q_timedelta = self.df.groupby(self.group_attr).agg({self.time_attr: lambda x: x.diff().dropna().unique()}).reset_index(drop=False) + q_timedelta = ( + self.df.groupby(self.group_attr) + .agg({self.time_attr: lambda x: x.diff().dropna().unique()}) + .reset_index(drop=False) + ) q_timedelta = np.unique(q_timedelta[self.time_attr].values.all()) - kwh_to_kw = (np.timedelta64(1, 'h')/q_timedelta[0]).astype(int) + kwh_to_kw = (np.timedelta64(1, "h") / q_timedelta[0]).astype(int) print(kwh_to_kw) - q_timedelta = [i.astype('int64')/(10**9*60) for i in q_timedelta] - if len(q_timedelta)>1: - warnings.warn(f'different time resolution (minute): {q_timedelta}') + q_timedelta = [i.astype("int64") / (10**9 * 60) for i in q_timedelta] + if len(q_timedelta) > 1: + warnings.warn(f"different time resolution (minute): {q_timedelta}") # else: # print(Fore.GREEN+f" time resolution (minute): {q_timedelta}") self.df.loc[:, self.E2P_attr] *= kwh_to_kw if MW: self.df.loc[:, self.E2P_attr] /= 1000 - - - + def my_fig(self, ncols=1): - matplotlib.rc('font', **self.font) - fig = plt.figure(constrained_layout=True, figsize=[ncols*self.figsize[0], self.k_total*self.figsize[1]], dpi=self.dpi) + matplotlib.rc("font", **self.font) + fig = plt.figure( + constrained_layout=True, + figsize=[ncols * self.figsize[0], self.k_total * self.figsize[1]], + dpi=self.dpi, + ) spec = GridSpec(self.k_total, ncols, figure=fig) - axs = [0]*self.k_total - twin_ax = [0]*self.k_total + axs = [0] * self.k_total + twin_ax = [0] * self.k_total for k in range(self.k_total): axs[k] = fig.add_subplot(spec[k, 0]) axs[k].margins(x=0) axs[k].set_xlabel(self.x_label) - if self.onlyntime and k != self.k_total-1: + if self.onlyntime and k != self.k_total - 1: # axs[k].set_xlabel() - axs[k].xaxis.label.set_visible(False) + axs[k].xaxis.label.set_visible(False) twin_ax[k] = axs[k].twinx() twin_ax[k].margins(x=0) return fig, spec, axs, twin_ax - - + def save_fig(self): - if len(self.save_path)>1: + if len(self.save_path) > 1: plt.ioff() - self.fig.savefig(os.getcwd()+'/'+self.save_path, transparent=True, bbox_inches=self.bbox_inches, pad_inches=0, dpi=self.dpi) - print('saved as: ', os.getcwd()+'/'+self.save_path) + self.fig.savefig( + os.getcwd() + "/" + self.save_path, + transparent=True, + bbox_inches=self.bbox_inches, + pad_inches=0, + dpi=self.dpi, + ) + print("saved as: ", os.getcwd() + "/" + self.save_path) plt.close(self.fig) - + def color_nights_func(self): for i in range(0, self.k_total, np.unique(self.plt_loc[:, 0]).shape[0]): - t0_dt = self.t0_lc_dt + i/(np.unique(self.plt_loc[:, 0]).shape[0])*self.plt_period + t0_dt = ( + self.t0_lc_dt + + i / (np.unique(self.plt_loc[:, 0]).shape[0]) * self.plt_period + ) t1_dt = t0_dt + self.plt_period night_t0_dt = (t0_dt - timedelta(days=1)).replace(hour=22, minute=0) night_t1_dt = (t1_dt - self.plt_period).replace(hour=7, minute=0) @@ -244,32 +334,49 @@ def color_nights_func(self): period_1 = min(t1_num, night_t1_num) if period_1 > period_0: for ii in range(np.unique(self.plt_loc[:, 0]).shape[0]): - self.axs[i+ii].axvspan(period_0, period_1, alpha=0.1, color='black', label='Night Time') - night_t0_dt = night_t0_dt+timedelta(days=1) + self.axs[i + ii].axvspan( + period_0, + period_1, + alpha=0.1, + color="black", + label="Night Time", + ) + night_t0_dt = night_t0_dt + timedelta(days=1) night_t0_num = date2num(night_t0_dt) - night_t1_dt =night_t1_dt+timedelta(days=1) + night_t1_dt = night_t1_dt + timedelta(days=1) night_t1_num = date2num(night_t1_dt) - def do(self): - num_periods = self.df_grouped['num_periods'].unique() + num_periods = self.df_grouped["num_periods"].unique() num_plots_per = np.unique(self.plt_loc[:, 0]).shape[0] k_period = -1 for k in range(0, self.k_total, num_plots_per): - if k % num_plots_per==0: + if k % num_plots_per == 0: k_period += 1 - - df_part = self.df_grouped[self.df_grouped['num_periods']==num_periods[k_period]] + + df_part = self.df_grouped[ + self.df_grouped["num_periods"] == num_periods[k_period] + ] time_axis = df_part[self.time_attr] time_axis = np.asarray(time_axis)[0] - xticks = [time_axis[i] for i in np.ceil(np.linspace(0, len(time_axis)-1, self.n_ticks, endpoint=True)).astype(int)] - xlabels = [time_axis[i].strftime(self.x_format) for i in np.ceil(np.linspace(0, len(time_axis)-1, self.n_ticks, endpoint=True)).astype(int)] + xticks = [ + time_axis[i] + for i in np.ceil( + np.linspace(0, len(time_axis) - 1, self.n_ticks, endpoint=True) + ).astype(int) + ] + xlabels = [ + time_axis[i].strftime(self.x_format) + for i in np.ceil( + np.linspace(0, len(time_axis) - 1, self.n_ticks, endpoint=True) + ).astype(int) + ] j_color = 0 for g_title in self.g_titles: - # for plt_attr in self.plt_attr: + # for plt_attr in self.plt_attr: for plt_attr in self.plt_attr: - # for g_title in self.g_titles: - y_axis = df_part[df_part[self.group_attr]==g_title][plt_attr] + # for g_title in self.g_titles: + y_axis = df_part[df_part[self.group_attr] == g_title][plt_attr] y_axis = np.asarray(y_axis)[0] fig_num = self.plt_config[plt_attr][0] ax_loc = self.plt_config[plt_attr][1] @@ -286,100 +393,151 @@ def do(self): yticks = np.linspace(min(y_axis), max(y_axis), self.n_yticks) else: yticks = self.yticks[self.plt_attr.index(plt_attr)] - if ax_loc==0: - ax = self.axs[k+fig_num] + if ax_loc == 0: + ax = self.axs[k + fig_num] if self.step: - ax.step(time_axis, y_axis, where='post', color = linecolor, linestyle=linestyle) + ax.step( + time_axis, + y_axis, + where="post", + color=linecolor, + linestyle=linestyle, + ) else: - ax.plot(time_axis, y_axis, color = linecolor, linestyle=linestyle) + ax.plot( + time_axis, y_axis, color=linecolor, linestyle=linestyle + ) ax.grid(linewidth=self.gridwidth[0]) if self.onlyntime: - if ax==self.axs[-1]: - ax.set_xticklabels(xlabels, rotation=self.rotation, ha='right') + if ax == self.axs[-1]: + ax.set_xticklabels( + xlabels, rotation=self.rotation, ha="right" + ) else: - ax.set_xticklabels([], rotation=self.rotation, ha='right') + ax.set_xticklabels( + [], rotation=self.rotation, ha="right" + ) else: - ax.set_xticklabels(xlabels, rotation=self.rotation, ha='right') + ax.set_xticklabels( + xlabels, rotation=self.rotation, ha="right" + ) else: - ax = self.twin_axs[k+fig_num] + ax = self.twin_axs[k + fig_num] if self.step: - ax.step(time_axis, y_axis, where='post', color = linecolor, linestyle=linestyle) + ax.step( + time_axis, + y_axis, + where="post", + color=linecolor, + linestyle=linestyle, + ) else: - ax.plot(time_axis, y_axis, color = linecolor, linestyle=linestyle) - ax.grid(linewidth=self.gridwidth[1], linestyle=':') + ax.plot( + time_axis, y_axis, color=linecolor, linestyle=linestyle + ) + ax.grid(linewidth=self.gridwidth[1], linestyle=":") t0_dt = time_axis[0] t1_dt = time_axis[-1] - + if self.only1title: if ax == self.axs[0]: if t0_dt.date() != t1_dt.date(): - if len(str(t0_dt.date())[self.title_i:self.title_j]) > 0: - ax.set_title(f'{self.title} {str(t0_dt.date())[self.title_i:self.title_j]}---{str(t1_dt.date())[self.title_i:self.title_j]}') + if ( + len(str(t0_dt.date())[self.title_i : self.title_j]) + > 0 + ): + ax.set_title( + f"{self.title} {str(t0_dt.date())[self.title_i : self.title_j]}---{str(t1_dt.date())[self.title_i : self.title_j]}" + ) else: - ax.set_title(f'{self.title} {str(t0_dt.date())[self.title_i:self.title_j]}') + ax.set_title( + f"{self.title} {str(t0_dt.date())[self.title_i : self.title_j]}" + ) else: if t0_dt.date() != t1_dt.date(): - if len(str(t0_dt.date())[self.title_i:self.title_j]) > 0: - ax.set_title(f'{self.title} {str(t0_dt.date())[self.title_i:self.title_j]}---{str(t1_dt.date())[self.title_i:self.title_j]}') + if len(str(t0_dt.date())[self.title_i : self.title_j]) > 0: + ax.set_title( + f"{self.title} {str(t0_dt.date())[self.title_i : self.title_j]}---{str(t1_dt.date())[self.title_i : self.title_j]}" + ) else: - ax.set_title(f'{self.title} {str(t0_dt.date())[self.title_i:self.title_j]}') - + ax.set_title( + f"{self.title} {str(t0_dt.date())[self.title_i : self.title_j]}" + ) + ax.set_ylabel(self.y_labels[self.plt_attr.index(plt_attr)]) - ax.set_xlim(min(time_axis), max(time_axis)) + ax.set_xlim(min(time_axis), max(time_axis)) ax.set_xticks(xticks) ax.set_yticks(yticks) - + ax.yaxis.set_major_formatter(FuncFormatter(self.format_ticks)) if y_min is not None: ax.set_ylim(y_min, y_max) elif self.y_max is not None: - ax.set_ylim(self.y_min[self.plt_attr.index(plt_attr)], self.y_max[self.plt_attr.index(plt_attr)]) - if self.color_by == 'attribute': - j_color += 1 - if self.color_by == 'group': + ax.set_ylim( + self.y_min[self.plt_attr.index(plt_attr)], + self.y_max[self.plt_attr.index(plt_attr)], + ) + if self.color_by == "attribute": j_color += 1 - + if self.color_by == "group": + j_color += 1 + if self.show_legends: - self.set_legends() + self.set_legends() if self.color_nights: self.color_nights_func() self.save_fig() - + def set_legends(self): self.lgnd_loc = cycle(self.lgnd_loc) plt_attr = np.char.array(self.plt_attr) for k in range(0, self.k_total, np.unique(self.plt_loc[:, 0]).shape[0]): for i in range(np.unique(self.plt_loc[:, 0]).shape[0]): - ax_0_idx = np.logical_and(self.plt_loc[:, 0] == i, self.plt_loc[:, 1] ==0) - ax_1_idx = np.logical_and(self.plt_loc[:, 0] == i, self.plt_loc[:, 1] ==1) + ax_0_idx = np.logical_and( + self.plt_loc[:, 0] == i, self.plt_loc[:, 1] == 0 + ) + ax_1_idx = np.logical_and( + self.plt_loc[:, 0] == i, self.plt_loc[:, 1] == 1 + ) lgnds_0 = plt_attr[ax_0_idx] # print(k, i, lgnds_0) - lgnds_0 = [ll[self.legend_i:self.legend_j] for ll in lgnds_0] + lgnds_0 = [ll[self.legend_i : self.legend_j] for ll in lgnds_0] lgnds_1 = plt_attr[ax_1_idx] # print(k, i, lgnds_1) - lgnds_1 = [ll[self.legend_i:self.legend_j] for ll in lgnds_1] + lgnds_1 = [ll[self.legend_i : self.legend_j] for ll in lgnds_1] if len(lgnds_0) > 0: - self.axs[k+i].legend([str(g_title)+self.legend_join+str(lgnd) for g_title in self.g_titles for lgnd in lgnds_0 ], loc=next(self.lgnd_loc)) + self.axs[k + i].legend( + [ + str(g_title) + self.legend_join + str(lgnd) + for g_title in self.g_titles + for lgnd in lgnds_0 + ], + loc=next(self.lgnd_loc), + ) # self.axs[k+i].grid(linewidth=self.gridwidth[0]) else: - self.axs[k+i].set_yticks([]) + self.axs[k + i].set_yticks([]) if len(lgnds_1) > 0: # pass - legend_vec = [str(g_title)+self.legend_join+str(lgnd) for g_title in self.g_titles for lgnd in lgnds_1 ] + legend_vec = [ + str(g_title) + self.legend_join + str(lgnd) + for g_title in self.g_titles + for lgnd in lgnds_1 + ] if self.lim_legend is not None: - lines = self.twin_axs[k+i].get_lines() + lines = self.twin_axs[k + i].get_lines() lines[-1].set_label(self.lim_legend) - self.twin_axs[k+i].legend(loc=self.lgnd_loc[1]) + self.twin_axs[k + i].legend(loc=self.lgnd_loc[1]) else: - self.twin_axs[k+i].legend(legend_vec, loc=next(self.lgnd_loc)) + self.twin_axs[k + i].legend(legend_vec, loc=next(self.lgnd_loc)) # self.twin_axs[k+i].grid(linewidth=self.gridwidth[1], linestyle=':') else: - self.twin_axs[k+i].set_yticks([]) + self.twin_axs[k + i].set_yticks([]) def format_ticks(self, x, pos): - string_format = '{:0' + self.ax_digit + '}' + string_format = "{:0" + self.ax_digit + "}" return string_format.format(x) - + def remove_yellow(cmap_name): cmap = plt.get_cmap(cmap_name) @@ -389,4 +547,4 @@ def remove_yellow(cmap_name): if np.allclose(color[:3], [1, 1, 0], atol=0.1): # Adjust tolerance as needed colors[i] = [1, 0.65, 0, 1] # Replace yellow with black or another color # colors[i] = None # Replace yellow with black or another color - return LinearSegmentedColormap.from_list('custom_cmap', colors) \ No newline at end of file + return LinearSegmentedColormap.from_list("custom_cmap", colors)