From dc6bc392798df43296e19412139c4fc2b1b0f3de Mon Sep 17 00:00:00 2001 From: Mark Melotto Date: Wed, 3 Jun 2026 15:01:06 +0200 Subject: [PATCH] adding grdc data to getting started --- book/_toc.yml | 1 + book/content/forcing/discharge.ipynb | 167 +++++++++++++++++++++++++++ book/content/generate_forcing.md | 6 + 3 files changed, 174 insertions(+) create mode 100644 book/content/forcing/discharge.ipynb diff --git a/book/_toc.yml b/book/_toc.yml index 471f71f..3a3f1bd 100644 --- a/book/_toc.yml +++ b/book/_toc.yml @@ -22,6 +22,7 @@ parts: - file: content/forcing/cmip_historic.ipynb - file: content/forcing/cmip_future.ipynb - file: content/forcing/manual_forcing.ipynb + - file: content/forcing/discharge.ipynb - file: content/different_models.md sections: diff --git a/book/content/forcing/discharge.ipynb b/book/content/forcing/discharge.ipynb new file mode 100644 index 0000000..b4e1ce2 --- /dev/null +++ b/book/content/forcing/discharge.ipynb @@ -0,0 +1,167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1b2c3d4e5f6a7b8", + "metadata": {}, + "source": [ + "# GRDC Discharge Observations\n", + "\n", + "The [Global Runoff Data Centre (GRDC)](https://portal.grdc.bafg.de/) is the primary source for historical daily and monthly river discharge data worldwide, covering thousands of gauging stations.\n", + "\n", + "Unlike the Caravan or ERA5 data, GRDC data requires a manual download step — you register on the GRDC portal, select the stations you need, and download the data files to your own machine.\n", + "Once you have the files, eWaterCycle makes loading them straightforward.\n", + "\n", + "What we need:\n", + "1. A GRDC station ID (found via the [GRDC station catalogue](https://portal.grdc.bafg.de/))\n", + "2. 
A time window" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2c3d4e5f6a7b8c9", + "metadata": {}, + "outputs": [], + "source": [ + "# General python\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from pathlib import Path\n", + "\n", + "# Niceties\n", + "from rich import print\n", + "\n", + "# eWaterCycle observation module\n", + "from ewatercycle.observation.grdc import get_grdc_data" + ] + }, + { + "cell_type": "markdown", + "id": "c3d4e5f6a7b8c9d0", + "metadata": {}, + "source": [ + "## Downloading GRDC data\n", + "\n", + "**Normally** you would have to download the GRDC data yourself.\n", + "We have already downloaded the data for a number of GRDC stations; if your station is not in our database, please ask the admin to add it.\n", + "Alternatively, you can download the data yourself:\n", + "\n", + "1. Register (free) at [portal.grdc.bafg.de](https://portal.grdc.bafg.de/)\n", + "2. Search for your station by name, river, or country\n", + "3. Select the station and download the **daily** data as a `.txt` file\n", + "4. Place the downloaded file(s) in a directory on your machine and pass that path to `get_grdc_data` via its `data_home` argument (if it is omitted, the GRDC location from the eWaterCycle configuration is used)\n", + "\n", + "The GRDC station ID is a 7-digit number visible in the portal and in the filename of the downloaded file (e.g. `6335020_Q_Day.Cmd.txt`)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e5f6a7b8c9d0e1", + "metadata": {}, + "outputs": [], + "source": [ + "# Rhine at Lobith — one of the most monitored river cross-sections in Europe\n", + "station_id = \"6335020\"\n", + "\n", + "experiment_start_date = \"2000-01-01T00:00:00Z\"\n", + "experiment_end_date = \"2005-12-31T00:00:00Z\"" + ] + }, + { + "cell_type": "markdown", + "id": "e5f6a7b8c9d0e1f2", + "metadata": {}, + "source": [ + "## Loading the observations\n", + "\n", + "`get_grdc_data` reads the downloaded GRDC file and returns an xarray dataset that contains the daily discharge values together with the station information (name, river, coordinates, drainage area, etc.)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6a7b8c9d0e1f2a3", + "metadata": {}, + "outputs": [], + "source": [ + "observations_ds = get_grdc_data(\n", + " station_id=station_id,\n", + " start_time=experiment_start_date,\n", + " end_time=experiment_end_date,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c9d0e1f2a3b4c5d6", + "metadata": {}, + "source": "The station metadata (name, river, country, coordinates) is embedded as scalar variables in the dataset.\nThe discharge timeseries is in `streamflow`, with `-999` used as a missing value — we mask those before plotting." + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0e1f2a3b4c5d6e7", + "metadata": {}, + "outputs": [], + "source": "print(observations_ds)" + }, + { + "cell_type": "markdown", + "id": "e1f2a3b4c5d6e7f8", + "metadata": {}, + "source": [ + "## Plotting the discharge\n", + "\n", + "The `streamflow` variable of the dataset is an xarray `DataArray` (not a pandas Series), so once the missing values are masked it can be plotted directly with its `.plot()` method."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2a3b4c5d6e7f8a9", + "metadata": {}, + "outputs": [], + "source": [ + "station_name = str(observations_ds[\"station_name\"].values)\n", + "river_name = str(observations_ds[\"river_name\"].values)\n", + "\n", + "# Mask the -999 missing value sentinel before plotting\n", + "streamflow = observations_ds[\"streamflow\"].where(observations_ds\n", + " [\"streamflow\"] != -999)\n", + "\n", + "plt.figure(figsize=(12, 4))\n", + "streamflow.plot(color=\"steelblue\")\n", + "plt.title(f\"GRDC station {station_id} — {station_name} ({river_name})\")\n", + "plt.xlabel(\"Date\")\n", + "plt.ylabel(\"Discharge (m³/s)\")\n", + "plt.tight_layout()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbformat_minor": 5, + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/book/content/generate_forcing.md b/book/content/generate_forcing.md index b94f30b..e310789 100644 --- a/book/content/generate_forcing.md +++ b/book/content/generate_forcing.md @@ -11,6 +11,7 @@ Every model needs forcing data, there are several possible ways to get this forc - CMIP6 historical data - CMIP6 future data - Manual data input +- GRDC discharge observations eWaterCycle supports different types of forcings, currently it supports: @@ -46,6 +47,11 @@ The forcing object in eWaterCycle has some properties: - filenames, a dictionary containing the paths to the netCDF files where the data is stored for that variable - dataset, which I will cover in more detail below. +## Observations + +It is also possible to use GRDC discharge observations as ground truth in your research.
+Data for various GRDC stations is already available on the server; if the station you need is missing there but is available from the GRDC, please ask an admin to add it for you. + ### Technical Details This is for advanced users that will need to use different datasets.