From 395b8386fdb1b54c3c27ba9ee77ec6ea376a58e0 Mon Sep 17 00:00:00 2001
From: wingertkm-NIH <kristen.wingert@nih.gov>
Date: Wed, 27 May 2026 08:23:54 -0400
Subject: [PATCH 1/2] Update STAT-tutorial.ipynb

---
 .../ncbi-stat-tutorial/STAT-tutorial.ipynb    | 242 ++++++++++++++++--
 1 file changed, 221 insertions(+), 21 deletions(-)

diff --git a/notebooks/ncbi-stat-tutorial/STAT-tutorial.ipynb b/notebooks/ncbi-stat-tutorial/STAT-tutorial.ipynb
index 5c72f2d..e38d449 100644
--- a/notebooks/ncbi-stat-tutorial/STAT-tutorial.ipynb
+++ b/notebooks/ncbi-stat-tutorial/STAT-tutorial.ipynb
@@ -256,32 +256,232 @@
     "df[columns_to_write].to_csv(output_file, sep='\\t', index=False)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "04eabbd9-24dc-4035-8dee-dd6f049a3270",
-   "metadata": {},
-   "source": [
+ {
+  "cell_type": "markdown",
+  "id": "04eabbd9-24dc-4035-8dee-dd6f049a3270",
+  "metadata": {},
+  "source": [
     "If you want to experiment a bit, rerun the query with a different tax id, modify the total_count, and modify the time Interval and see how your results change. Or, we can run a few more example queries from the [NCBI STAT page](https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud-based-taxonomy-analysis-table/). "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2f8d42ae",
-   "metadata": {},
-   "source": [
+  ]
+},
+{
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "geo-graph-code",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+    "# -----------------------------------------------------------------------------\n",
+    "# Install dependencies if needed\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# !pip install pandas plotly numpy\n",
+    "\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# Imports\n",
+    "# -----------------------------------------------------------------------------\n",
+    "import ast\n",
+    "import re\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import plotly.express as px\n",
+    "\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# Load data\n",
+    "# -----------------------------------------------------------------------------\n",
+    "df.columns = [c.strip().lower() for c in df.columns]\n",
+    "\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# Type cleanup\n",
+    "# -----------------------------------------------------------------------------\n",
+    "if \"releasedate\" in df.columns:\n",
+    "    df[\"releasedate\"] = pd.to_datetime(df[\"releasedate\"], errors=\"coerce\")\n",
+    "\n",
+    "for col in [\"total_count\", \"self_count\"]:\n",
+    "    if col in df.columns:\n",
+    "        df[col] = pd.to_numeric(df[col], errors=\"coerce\")\n",
+    "\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# Robust geography parser\n",
+    "# -----------------------------------------------------------------------------\n",
+    "MISSING_LIKE = {\n",
+    "    \"\", \"[]\", \"['']\", \"['missing']\", \"['not applicable']\",\n",
+    "    \"['not collected']\", \"['not provided']\", \"missing\", \"not applicable\",\n",
+    "    \"not collected\", \"not provided\", \"nan\", \"none\"\n",
+    "}\n",
+    "\n",
+    "COUNTRY_ALIASES = {\n",
+    "    \"USA\": \"United States\",\n",
+    "    \"U.S.A.\": \"United States\",\n",
+    "    \"US\": \"United States\",\n",
+    "    \"UK\": \"United Kingdom\",\n",
+    "    \"Korea\": \"South Korea\",\n",
+    "    \"South Korea\": \"South Korea\",\n",
+    "    \"North Korea\": \"North Korea\",\n",
+    "    \"Russian Federation\": \"Russia\",\n",
+    "}\n",
+    "\n",
+    "def parse_geo_loc(value):\n",
+    "    if value is None:\n",
+    "        return None, None, None\n",
+    "\n",
+    "    if isinstance(value, float) and pd.isna(value):\n",
+    "        return None, None, None\n",
+    "\n",
+    "    if isinstance(value, np.ndarray):\n",
+    "        if value.size == 0:\n",
+    "            return None, None, None\n",
+    "        if value.size == 1:\n",
+    "            value = value.item()\n",
+    "        else:\n",
+    "            value = value.tolist()\n",
+    "\n",
+    "    s = str(value).strip()\n",
+    "\n",
+    "    if not s or s.lower() in MISSING_LIKE:\n",
+    "        return None, None, None\n",
+    "\n",
+    "    try:\n",
+    "        parsed = ast.literal_eval(s)\n",
+    "        if isinstance(parsed, list):\n",
+    "            if len(parsed) == 0:\n",
+    "                return None, None, None\n",
+    "            s = str(parsed[0]).strip()\n",
+    "        else:\n",
+    "            s = str(parsed).strip()\n",
+    "    except Exception:\n",
+    "        s = s.strip(\"[]\").strip(\"'\").strip('\"').strip()\n",
+    "\n",
+    "    if not s or s.lower() in MISSING_LIKE:\n",
+    "        return None, None, None\n",
+    "\n",
+    "    s = re.sub(r\"\\s+\", \" \", s).strip()\n",
+    "\n",
+    "    if \":\" in s:\n",
+    "        country, region = [x.strip() for x in s.split(\":\", 1)]\n",
+    "    else:\n",
+    "        country, region = s, None\n",
+    "\n",
+    "    country = COUNTRY_ALIASES.get(country, country)\n",
+    "\n",
+    "    return country, region, s\n",
+    "\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# Apply parser\n",
+    "# -----------------------------------------------------------------------------\n",
+    "if \"geo_loc_name_sam\" not in df.columns:\n",
+    "    raise KeyError(\"The file does not contain a 'geo_loc_name_sam' column.\")\n",
+    "\n",
+    "geo = df[\"geo_loc_name_sam\"].apply(parse_geo_loc)\n",
+    "\n",
+    "df[\"country\"] = geo.apply(lambda x: x[0])\n",
+    "df[\"region\"] = geo.apply(lambda x: x[1])\n",
+    "df[\"geo_clean\"] = geo.apply(lambda x: x[2])\n",
+    "\n",
+    "geo_df = df.dropna(subset=[\"country\"]).copy()\n",
+    "\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# Country-level aggregation\n",
+    "# -----------------------------------------------------------------------------\n",
+    "country_counts = (\n",
+    "    geo_df.groupby(\"country\", as_index=False)\n",
+    "    .size()\n",
+    "    .rename(columns={\"size\": \"submissions\"})\n",
+    "    .sort_values(\"submissions\", ascending=False)\n",
+    ")\n",
+    "\n",
+    "top20_countries = country_counts.head(20).copy()\n",
+    "\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# 1) Choropleth map: submissions by country\n",
+    "# -----------------------------------------------------------------------------\n",
+    "fig_choropleth = px.choropleth(\n",
+    "    country_counts,\n",
+    "    locations=\"country\",\n",
+    "    locationmode=\"country names\",\n",
+    "    color=\"submissions\",\n",
+    "    color_continuous_scale=\"Viridis\",\n",
+    "    title=\"Submissions by Country\",\n",
+    ")\n",
+    "\n",
+    "fig_choropleth.update_layout(\n",
+    "    geo=dict(showframe=False, showcoastlines=True, projection_type=\"natural earth\"),\n",
+    "    margin=dict(l=0, r=0, t=50, b=0),\n",
+    ")\n",
+    "\n",
+    "fig_choropleth.show()\n",
+    "\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# 2) Bar chart: top countries\n",
+    "# -----------------------------------------------------------------------------\n",
+    "fig_top_countries = px.bar(\n",
+    "    top20_countries.sort_values(\"submissions\", ascending=True),\n",
+    "    x=\"submissions\",\n",
+    "    y=\"country\",\n",
+    "    orientation=\"h\",\n",
+    "    title=\"Top 20 Countries by Submissions\",\n",
+    "    labels={\"submissions\": \"Submissions\", \"country\": \"Country\"},\n",
+    ")\n",
+    "\n",
+    "fig_top_countries.update_layout(\n",
+    "    yaxis=dict(categoryorder=\"total ascending\"),\n",
+    "    margin=dict(l=20, r=20, t=50, b=20),\n",
+    ")\n",
+    "\n",
+    "fig_top_countries.show()\n",
+    "\n",
+    "# -----------------------------------------------------------------------------\n",
+    "# 3) Bar chart: top countries/regions\n",
+    "# -----------------------------------------------------------------------------\n",
+    "geo_df[\"location_label\"] = geo_df.apply(\n",
+    "    lambda r: f\"{r['country']}: {r['region']}\" if pd.notna(r[\"region\"]) and r[\"region\"] else r[\"country\"],\n",
+    "    axis=1\n",
+    ")\n",
+    "\n",
+    "location_counts = (\n",
+    "    geo_df.groupby(\"location_label\", as_index=False)\n",
+    "    .size()\n",
+    "    .rename(columns={\"size\": \"submissions\"})\n",
+    "    .sort_values(\"submissions\", ascending=False)\n",
+    ")\n",
+    "\n",
+    "top20_locations = location_counts.head(20).copy()\n",
+    "\n",
+    "fig_top_locations = px.bar(\n",
+    "    top20_locations.sort_values(\"submissions\", ascending=True),\n",
+    "    x=\"submissions\",\n",
+    "    y=\"location_label\",\n",
+    "    orientation=\"h\",\n",
+    "    title=\"Top 20 Countries / Regions by Submissions\",\n",
+    "    labels={\"submissions\": \"Submissions\", \"location_label\": \"Country / Region\"},\n",
+    ")\n",
+    "\n",
+    "fig_top_locations.update_layout(\n",
+    "    yaxis=dict(categoryorder=\"total ascending\"),\n",
+    "    margin=dict(l=20, r=20, t=50, b=20),\n",
+    ")\n",
+    "\n",
+    "fig_top_locations.show()\n"
+  ]
+},
+{
+  "cell_type": "markdown",
+  "id": "2f8d42ae",
+  "metadata": {},
+  "source": [
     "## Conclusion\n",
     "Here you ran SQL queries using BigQuery to view STAT tables hosted in the cloud. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e7080684",
-   "metadata": {},
-   "source": [
+  ]
+},
+{
+  "cell_type": "markdown",
+  "id": "e7080684",
+  "metadata": {},
+  "source": [
     "## Clean Up\n",
     "Feel free to delete the Vertex AI instance where you ran this notebook"
-   ]
-  }
+  ]
+}
  ],
  "metadata": {},
  "nbformat": 4,

From 06b60be463d3c12b0d8c3185b401592cfe48b229 Mon Sep 17 00:00:00 2001
From: github-action <cbiit-github-action@github.com>
Date: Wed, 27 May 2026 12:24:16 +0000
Subject: [PATCH 2/2] Github Action: Lint Notebooks

---
 .../ncbi-stat-tutorial/STAT-tutorial.ipynb    | 60 +++++++++----------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/notebooks/ncbi-stat-tutorial/STAT-tutorial.ipynb b/notebooks/ncbi-stat-tutorial/STAT-tutorial.ipynb
index e38d449..794c0b3 100644
--- a/notebooks/ncbi-stat-tutorial/STAT-tutorial.ipynb
+++ b/notebooks/ncbi-stat-tutorial/STAT-tutorial.ipynb
@@ -256,21 +256,21 @@
     "df[columns_to_write].to_csv(output_file, sep='\\t', index=False)"
    ]
   },
- {
-  "cell_type": "markdown",
-  "id": "04eabbd9-24dc-4035-8dee-dd6f049a3270",
-  "metadata": {},
-  "source": [
+  {
+   "cell_type": "markdown",
+   "id": "04eabbd9-24dc-4035-8dee-dd6f049a3270",
+   "metadata": {},
+   "source": [
     "If you want to experiment a bit, rerun the query with a different tax id, modify the total_count, and modify the time Interval and see how your results change. Or, we can run a few more example queries from the [NCBI STAT page](https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud-based-taxonomy-analysis-table/). "
-  ]
-},
-{
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "geo-graph-code",
-  "metadata": {},
-  "outputs": [],
-  "source": [
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "geo-graph-code",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# -----------------------------------------------------------------------------\n",
     "# Install dependencies if needed\n",
     "# -----------------------------------------------------------------------------\n",
@@ -462,26 +462,26 @@
     ")\n",
     "\n",
     "fig_top_locations.show()\n"
-  ]
-},
-{
-  "cell_type": "markdown",
-  "id": "2f8d42ae",
-  "metadata": {},
-  "source": [
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2f8d42ae",
+   "metadata": {},
+   "source": [
     "## Conclusion\n",
     "Here you ran SQL queries using BigQuery to view STAT tables hosted in the cloud. "
-  ]
-},
-{
-  "cell_type": "markdown",
-  "id": "e7080684",
-  "metadata": {},
-  "source": [
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e7080684",
+   "metadata": {},
+   "source": [
     "## Clean Up\n",
     "Feel free to delete the Vertex AI instance where you ran this notebook"
-  ]
-}
+   ]
+  }
  ],
  "metadata": {},
  "nbformat": 4,