diff --git a/outputs/SRR11940660/heatmap_rank_genes_heatmap.png b/outputs/SRR11940660/heatmap_rank_genes_heatmap.png new file mode 100644 index 0000000..b659e3a Binary files /dev/null and b/outputs/SRR11940660/heatmap_rank_genes_heatmap.png differ diff --git a/outputs/SRR11940660/pca_loadings_pca_loadings_1_2.png b/outputs/SRR11940660/pca_loadings_pca_loadings_1_2.png new file mode 100644 index 0000000..9593506 Binary files /dev/null and b/outputs/SRR11940660/pca_loadings_pca_loadings_1_2.png differ diff --git a/outputs/SRR11940660/pca_pca_leiden.png b/outputs/SRR11940660/pca_pca_leiden.png new file mode 100644 index 0000000..f1ac343 Binary files /dev/null and b/outputs/SRR11940660/pca_pca_leiden.png differ diff --git a/outputs/SRR11940660/pca_pca_scatter.png b/outputs/SRR11940660/pca_pca_scatter.png new file mode 100644 index 0000000..df4e59e Binary files /dev/null and b/outputs/SRR11940660/pca_pca_scatter.png differ diff --git a/outputs/SRR11940660/pca_variance_ratio_pca_variance.png b/outputs/SRR11940660/pca_variance_ratio_pca_variance.png new file mode 100644 index 0000000..60dc83b Binary files /dev/null and b/outputs/SRR11940660/pca_variance_ratio_pca_variance.png differ diff --git a/outputs/SRR11940660/rank_genes_groups_leiden_rank_genes.png b/outputs/SRR11940660/rank_genes_groups_leiden_rank_genes.png new file mode 100644 index 0000000..10243c8 Binary files /dev/null and b/outputs/SRR11940660/rank_genes_groups_leiden_rank_genes.png differ diff --git a/outputs/SRR11940660/top_genes_PC1.csv b/outputs/SRR11940660/top_genes_PC1.csv new file mode 100644 index 0000000..2a1c590 --- /dev/null +++ b/outputs/SRR11940660/top_genes_PC1.csv @@ -0,0 +1,31 @@ +gene,loading,abs_loading +gene-U712_00580,0.12533931,0.12533931 +gene-U712_00585,0.10191295,0.10191295 +gene-U712_00700,0.09746355,0.09746355 +gene-U712_00735,0.09316955,0.09316955 +gene-U712_00625,0.091827266,0.091827266 +gene-U712_00600,0.09159618,0.09159618 +gene-U712_10210,0.091315836,0.091315836 +gene-U712_00695,0.09052001,0.09052001 +gene-U712_16965,0.089866,0.089866 +gene-U712_07665,0.086747095,0.086747095 +gene-U712_00705,0.08656324,0.08656324 +gene-U712_07310,0.08565827,0.08565827 +gene-U712_07675,0.08170622,0.08170622 +gene-U712_00570,0.08053239,0.08053239 +gene-U712_19840,0.0794902,0.0794902 +gene-U712_07300,0.07897619,0.07897619 +gene-U712_00560,0.0770236,0.0770236 +gene-U712_00615,0.07680274,0.07680274 +gene-U712_00655,0.07668921,0.07668921 +gene-U712_20290,0.076166674,0.076166674 +gene-U712_20750,0.07560649,0.07560649 +gene-U712_00630,0.07493979,0.07493979 +gene-U712_00610,0.07472653,0.07472653 +gene-U712_13780,0.07378898,0.07378898 +gene-U712_13925,0.07184191,0.07184191 +gene-U712_20755,0.071589574,0.071589574 +gene-U712_00665,0.07119978,0.07119978 +gene-U712_16980,0.07061581,0.07061581 +gene-U712_19250,0.07054084,0.07054084 +gene-U712_04730,0.07048743,0.07048743 diff --git a/outputs/SRR11940660/top_genes_PC2.csv b/outputs/SRR11940660/top_genes_PC2.csv new file mode 100644 index 0000000..2abc4a2 --- /dev/null +++ b/outputs/SRR11940660/top_genes_PC2.csv @@ -0,0 +1,31 @@ +gene,loading,abs_loading +gene-U712_14360,0.19461678,0.19461678 +gene-U712_08455,0.16187526,0.16187526 +gene-U712_19480,0.1406494,0.1406494 +gene-U712_14420,0.13294719,0.13294719 +gene-U712_08460,0.13027942,0.13027942 +gene-U712_05870,0.12474084,0.12474084 +gene-U712_04095,0.12371954,0.12371954 +gene-U712_04090,0.12287305,0.12287305 +gene-U712_19490,0.12224303,0.12224303 +gene-U712_10210,0.122157894,0.122157894 +gene-U712_04085,0.121980615,0.121980615 +gene-U712_14415,0.11803285,0.11803285 +gene-U712_09470,0.11435959,0.11435959 +gene-U712_19485,0.11332339,0.11332339 +gene-U712_01805,0.11167637,0.11167637 +gene-U712_04100,0.11119464,0.11119464 +gene-U712_20290,0.108160555,0.108160555 +gene-U712_15180,0.10660463,0.10660463 +gene-U712_15910,0.10639286,0.10639286 +gene-U712_01810,0.10413831,0.10413831 +gene-U712_11735,0.10216933,0.10216933 +gene-U712_14425,0.101383075,0.101383075 +gene-U712_10180,0.09884069,0.09884069 +gene-U712_15925,0.09504657,0.09504657 +gene-U712_03020,0.09022796,0.09022796 +gene-U712_14355,0.08683693,0.08683693 +gene-U712_17700,0.08638535,0.08638535 +gene-U712_11720,0.086365364,0.086365364 +gene-U712_11730,0.08547053,0.08547053 +gene-U712_11725,0.08386802,0.08386802 diff --git a/outputs/SRR11940660/umap_umap_leiden.png b/outputs/SRR11940660/umap_umap_leiden.png new file mode 100644 index 0000000..6b6862f Binary files /dev/null and b/outputs/SRR11940660/umap_umap_leiden.png differ diff --git a/outputs/SRR11940660/umap_umap_qc.png b/outputs/SRR11940660/umap_umap_qc.png new file mode 100644 index 0000000..33c47fb Binary files /dev/null and b/outputs/SRR11940660/umap_umap_qc.png differ diff --git a/outputs/SRR11940661/heatmap_rank_genes_heatmap.png b/outputs/SRR11940661/heatmap_rank_genes_heatmap.png new file mode 100644 index 0000000..b571983 Binary files /dev/null and b/outputs/SRR11940661/heatmap_rank_genes_heatmap.png differ diff --git a/outputs/SRR11940661/pca_loadings_pca_loadings_1_2.png b/outputs/SRR11940661/pca_loadings_pca_loadings_1_2.png new file mode 100644 index 0000000..45286d2 Binary files /dev/null and b/outputs/SRR11940661/pca_loadings_pca_loadings_1_2.png differ diff --git a/outputs/SRR11940661/pca_pca_leiden.png b/outputs/SRR11940661/pca_pca_leiden.png new file mode 100644 index 0000000..abd08a1 Binary files /dev/null and b/outputs/SRR11940661/pca_pca_leiden.png differ diff --git a/outputs/SRR11940661/pca_pca_scatter.png b/outputs/SRR11940661/pca_pca_scatter.png new file mode 100644 index 0000000..6611101 Binary files /dev/null and b/outputs/SRR11940661/pca_pca_scatter.png differ diff --git a/outputs/SRR11940661/pca_variance_ratio_pca_variance.png b/outputs/SRR11940661/pca_variance_ratio_pca_variance.png new file mode 100644 index 0000000..7d64836 Binary files /dev/null and b/outputs/SRR11940661/pca_variance_ratio_pca_variance.png differ diff --git a/outputs/SRR11940661/rank_genes_groups_leiden_rank_genes.png b/outputs/SRR11940661/rank_genes_groups_leiden_rank_genes.png new file mode 100644 index 0000000..91fa717 Binary files /dev/null and b/outputs/SRR11940661/rank_genes_groups_leiden_rank_genes.png differ diff --git a/outputs/SRR11940661/top_genes_PC1.csv b/outputs/SRR11940661/top_genes_PC1.csv new file mode 100644 index 0000000..a725c68 --- /dev/null +++ b/outputs/SRR11940661/top_genes_PC1.csv @@ -0,0 +1,31 @@ +gene,loading,abs_loading +gene-U712_00580,0.11417147,0.11417147 +gene-U712_10210,0.107354835,0.107354835 +gene-U712_00585,0.1014146,0.1014146 +gene-U712_00700,0.08827221,0.08827221 +gene-U712_00735,0.08574556,0.08574556 +gene-U712_20290,0.08477078,0.08477078 +gene-U712_16965,0.08411337,0.08411337 +gene-U712_00695,0.08332969,0.08332969 +gene-U712_19840,0.08085494,0.08085494 +gene-U712_00625,0.07938439,0.07938439 +gene-U712_00705,0.07869947,0.07869947 +gene-U712_07665,0.07662746,0.07662746 +gene-U712_06215,0.074582025,0.074582025 +gene-U712_07310,0.07336663,0.07336663 +gene-U712_07675,0.07330899,0.07330899 +gene-U712_00560,0.07280758,0.07280758 +gene-U712_06220,0.071887605,0.071887605 +gene-U712_00600,0.07138487,0.07138487 +gene-U712_18050,0.07114354,0.07114354 +gene-U712_01810,0.0710372,0.0710372 +gene-U712_18700,0.07083048,0.07083048 +gene-U712_19250,0.06929057,0.06929057 +gene-U712_01805,0.06784177,0.06784177 +gene-U712_03020,0.067316405,0.067316405 +gene-U712_20285,0.06691975,0.06691975 +gene-U712_08455,0.0666639,0.0666639 +gene-U712_07680,0.06645966,0.06645966 +gene-U712_10205,0.065749034,0.065749034 +gene-U712_16300,0.065217495,0.065217495 +gene-U712_09415,0.064948425,0.064948425 diff --git a/outputs/SRR11940661/top_genes_PC2.csv b/outputs/SRR11940661/top_genes_PC2.csv new file mode 100644 index 0000000..da51394 --- /dev/null +++ b/outputs/SRR11940661/top_genes_PC2.csv @@ -0,0 +1,31 @@ +gene,loading,abs_loading +gene-U712_19480,0.1653905,0.1653905 +gene-U712_19490,0.15668231,0.15668231 +gene-U712_14360,0.14526144,0.14526144 +gene-U712_19485,0.14301682,0.14301682 +gene-U712_08455,0.1222755,0.1222755 +gene-U712_05870,0.12153367,0.12153367 +gene-U712_09470,0.11716664,0.11716664 +gene-U712_14420,0.114116274,0.114116274 +gene-U712_19495,0.11070127,0.11070127 +gene-U712_11735,0.105555564,0.105555564 +gene-U712_08460,0.103325754,0.103325754 +gene-U712_14415,0.10313875,0.10313875 +gene-U712_11725,0.10097674,0.10097674 +gene-U712_06215,-0.09902469,0.09902469 +gene-U712_06220,-0.09858269,0.09858269 +gene-U712_10210,0.09832671,0.09832671 +gene-U712_14425,0.09571376,0.09571376 +gene-U712_11720,0.095571786,0.095571786 +gene-U712_11730,0.093575805,0.093575805 +gene-U712_11740,0.0906311,0.0906311 +gene-U712_11715,0.08936489,0.08936489 +gene-U712_02265,0.08880372,0.08880372 +gene-U712_00600,-0.08737269,0.08737269 +gene-U712_15910,0.085302696,0.085302696 +gene-U712_01805,0.084308736,0.084308736 +gene-U712_20290,0.08309676,0.08309676 +gene-U712_19860,-0.08125018,0.08125018 +gene-U712_08905,0.08097693,0.08097693 +gene-U712_11710,0.08042957,0.08042957 +gene-U712_19855,-0.08037939,0.08037939 diff --git a/outputs/SRR11940661/umap_umap_leiden.png b/outputs/SRR11940661/umap_umap_leiden.png new file mode 100644 index 0000000..f18afd7 Binary files /dev/null and b/outputs/SRR11940661/umap_umap_leiden.png differ diff --git a/outputs/SRR11940661/umap_umap_qc.png b/outputs/SRR11940661/umap_umap_qc.png new file mode 100644 index 0000000..f699dbb Binary files /dev/null and b/outputs/SRR11940661/umap_umap_qc.png differ diff --git a/scripts/scrna_base.py b/scripts/scrna_base.py new file mode 100644 index 0000000..82e6ebe --- /dev/null +++ b/scripts/scrna_base.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +import os +from os.path import join +import argparse + +import numpy as np +import pandas as pd +import scipy.sparse as sp +import matplotlib +matplotlib.use("Agg") # ensure headless plotting +import matplotlib.pyplot as plt +import seaborn as sns + +from scipy.stats import median_abs_deviation +from BCBio import GFF +import scanpy as sc + + +# --------------------------- +# Helpers +# --------------------------- +def get_protein_id(feature): + try: + protein_id = feature.sub_features[0].qualifiers.get("protein_id") + return protein_id[0] if protein_id else "" + except Exception: + return "" + +def get_product_name(feature): + try: + product = feature.sub_features[0].qualifiers.get("product") + return product[0] if product else "" + except Exception: + return "" + +def parse_gff_build_maps(gff_file): + protein_ids, feature_ids, feature_types, product_names = [], [], [], [] + with open(gff_file) as in_handle: + for rec in GFF.parse(in_handle): + for feature in rec.features: + if feature.type == "gene": + protein_ids.append(get_protein_id(feature)) + product_names.append(get_product_name(feature)) + feature_ids.append(feature.qualifiers["ID"][0]) + feature_types.append(feature.qualifiers.get("gene_biotype", [""])[0]) + feature_types_map = dict(zip(feature_ids, feature_types)) + protein_ids_map = dict(zip(feature_ids, protein_ids)) + product_names_map = dict(zip(feature_ids, product_names)) + return feature_types_map, protein_ids_map, product_names_map + +def calculate_qc_metrics(adata): + sc.pp.calculate_qc_metrics( + adata, qc_vars=["rRNA", "tRNA"], inplace=True, percent_top=[20], log1p=True + ) + +def is_outlier(adata, metric: str, nmads: int): + M = adata.obs[metric] + outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | ( + np.median(M) + nmads * median_abs_deviation(M) < M + ) + return outlier + +def top_genes_for_pc(loadings_col, genes, n=30): + abs_vals = np.abs(loadings_col) + idx = np.argsort(abs_vals)[::-1][:n] + return pd.DataFrame({ + "gene": genes[idx], + "loading": loadings_col[idx], + "abs_loading": abs_vals[idx], + }) + + +# --------------------------- +# Main pipeline +# --------------------------- +def main(args): + os.makedirs(args.outdir, exist_ok=True) + sc.settings.figdir = args.outdir # where scanpy will save figures + sc.settings.autoshow = False + + # --- GFF annotations --- + feature_types_map, protein_ids_map, product_names_map = parse_gff_build_maps(args.gff) + + # --- Read data --- + adata = sc.read_10x_mtx(args.input, var_names="gene_ids", cache=True) + print(adata) + + # --- Annotate genes --- + adata.var["gene_type"] = adata.var_names.map(feature_types_map) + adata = adata[:, ~adata.var["gene_type"].isna()].copy() + adata.var["protein_id"] = adata.var_names.map(protein_ids_map) + adata.var["product_names"] = adata.var_names.map(product_names_map) + + # Mark rRNA and tRNA genes + adata.var["rRNA"] = adata.var["gene_type"].str.contains("rRNA", case=False, na=False) + adata.var["tRNA"] = adata.var["gene_type"].str.contains("tRNA", case=False, na=False) + + # --- QC (initial) --- + calculate_qc_metrics(adata) + print(f"Total number of cells (raw): {adata.n_obs}") + + # --- Filter low counts --- + adata_filter = adata.copy() + sc.pp.filter_cells(adata_filter, min_counts=args.min_counts) + print(f"Total number of cells (min_counts≥{args.min_counts}): {adata_filter.n_obs}") + + # --- Identify outliers --- + # These columns exist after calculate_qc_metrics(log1p=True, percent_top=[20]) + adata_filter.obs["outlier"] = ( + is_outlier(adata_filter, "log1p_total_counts", 5) + | is_outlier(adata_filter, "log1p_n_genes_by_counts", 5) + | is_outlier(adata_filter, "pct_counts_in_top_20_genes", 5) + | is_outlier(adata_filter, "pct_counts_rRNA", 5) + ) + print(adata_filter.obs["outlier"].value_counts()) + + # --- Remove outliers and tRNA-high cells (if available) --- + adata_filter = adata_filter[~adata_filter.obs["outlier"]].copy() + if "pct_counts_tRNA" in adata_filter.obs.columns: + adata_filter = adata_filter[adata_filter.obs["pct_counts_tRNA"] < 10].copy() + + # --- Recalculate QC metrics after filtering --- + calculate_qc_metrics(adata_filter) + + # --- Remove rRNA and tRNA genes from matrix (keeps obs QC cols intact) --- + adata_filter = adata_filter[:, ~(adata_filter.var["rRNA"] | adata_filter.var["tRNA"])].copy() + print(f"Final shape after filtering: {adata_filter.shape}") + + # ====================================================== + # Layers: counts, norm, log1p + # ====================================================== + adata_filter.layers["counts"] = adata_filter.X.copy() # raw counts layer + + norm_res = sc.pp.normalize_total(adata_filter, target_sum=args.target_sum, inplace=False) + adata_filter.layers["norm"] = norm_res["X"].copy() + + adata_filter.layers["log1p"] = np.log1p(adata_filter.layers["norm"]) + adata_filter.X = adata_filter.layers["log1p"].copy() # use for downstream + + # ====================================================== + # PCA & plots + # ====================================================== + # Scale (sparse-safe) + if sp.issparse(adata_filter.X): + sc.pp.scale(adata_filter, max_value=10, zero_center=False) + else: + sc.pp.scale(adata_filter, max_value=10) + + # PCA (match zero_center path) + if sp.issparse(adata_filter.X): + sc.tl.pca(adata_filter, svd_solver="arpack", zero_center=False) + else: + sc.tl.pca(adata_filter, svd_solver="arpack") + + # Save PCA variance ratio and PCA scatter + sc.pl.pca_variance_ratio(adata_filter, log=True, show=False, save="_pca_variance.png") + sc.pl.pca( + adata_filter, + color=["total_counts", "n_genes_by_counts", "pct_counts_rRNA", "pct_counts_tRNA"], + show=False, save="_pca_scatter.png" + ) + + # Identify genes that impact PC1/PC2 the most & save + loadings = adata_filter.varm["PCs"] # (n_genes, n_pcs) + genes = adata_filter.var_names + top_pc1 = top_genes_for_pc(loadings[:, 0], genes, n=args.top_loading_genes) + top_pc2 = top_genes_for_pc(loadings[:, 1], genes, n=args.top_loading_genes) + top_pc1.to_csv(join(args.outdir, "top_genes_PC1.csv"), index=False) + top_pc2.to_csv(join(args.outdir, "top_genes_PC2.csv"), index=False) + + # Bar plots for loadings + sc.pl.pca_loadings(adata_filter, components=[1, 2], n_points=args.top_loading_genes, + show=False, save="_pca_loadings_1_2.png") + + # ====================================================== + # Neighbors / UMAP / Leiden + # ====================================================== + sc.pp.neighbors(adata_filter, n_neighbors=args.n_neighbors, n_pcs=args.n_pcs) + sc.tl.umap(adata_filter) + sc.pl.umap( + adata_filter, + color=["total_counts", "n_genes_by_counts", "pct_counts_rRNA", "pct_counts_tRNA"], + show=False, save="_umap_qc.png" + ) + + sc.tl.leiden(adata_filter, resolution=args.resolution) + sc.pl.pca(adata_filter, color=["leiden"], show=False, save="_pca_leiden.png") + sc.pl.umap(adata_filter, color=["leiden"], legend_loc="on data", show=False, save="_umap_leiden.png") + + # ====================================================== + # Differential expression & heatmaps + # ====================================================== + sc.tl.rank_genes_groups(adata_filter, groupby="leiden", method="wilcoxon") + sc.pl.rank_genes_groups(adata_filter, n_genes=10, sharey=False, show=False, save="_rank_genes.png") + + # Dendrogram must match current leiden categories (recompute defensively) + adata_filter.obs["leiden"] = adata_filter.obs["leiden"].astype("category") + adata_filter.obs["leiden"] = adata_filter.obs["leiden"].cat.remove_unused_categories() + adata_filter.uns.pop("dendrogram_leiden", None) + sc.tl.dendrogram(adata_filter, groupby="leiden") + + sc.pl.rank_genes_groups_heatmap( + adata_filter, n_genes=5, groupby="leiden", show=False, show_gene_labels=True, + save="_rank_genes_heatmap.png" + ) + + # Also save the AnnData object + adata_out = join(args.outdir, "adata_filtered.h5ad") + adata_filter.write(adata_out) + print(f"\n✓ Done. Results saved in: {args.outdir}") + print(f" - AnnData: {adata_out}") + print(" - Figures: saved by Scanpy into that directory (filenames start with the suffixes given to `save=`).") + print(" - Tables: top_genes_PC1.csv, top_genes_PC2.csv") + + +# --------------------------- +# CLI +# --------------------------- +if __name__ == "__main__": + p = argparse.ArgumentParser( + description="Scanpy pipeline with GFF annotation, layers (counts/norm/log1p), PCA/UMAP/Leiden, and saved plots." + ) + p.add_argument("--input", required=True, + help="Path to 10X mtx directory (folder containing matrix.mtx, barcodes.tsv, features.tsv).") + p.add_argument("--gff", required=True, help="GFF annotation file.") + p.add_argument("--outdir", required=True, help="Output directory for plots, tables, and h5ad.") + p.add_argument("--min-counts", type=int, default=20, help="Minimum counts per cell for filtering.") + p.add_argument("--target-sum", type=float, default=1e4, help="Library size target for normalize_total.") + p.add_argument("--n-neighbors", type=int, default=10, help="Neighbors for graph construction.") + p.add_argument("--n-pcs", type=int, default=30, help="Number of PCs for neighbors/UMAP.") + p.add_argument("--resolution", type=float, default=1.0, help="Leiden resolution.") + p.add_argument("--top-loading-genes", type=int, default=30, help="Top genes to report for PC1/PC2 and loading bars.") + args = p.parse_args() + main(args)