From 11aa79d15e679647bcc2e6ff9e47f65842becd70 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Wed, 25 Jun 2025 03:08:19 +0800 Subject: [PATCH 01/71] Mitigate ecoli/multigen parts from wcEcoli to vEcoli --- ecoli/analysis/multigeneration/replication.py | 178 +++++++++ .../multigeneration/ribosomeProduction.py | 344 +++++++++++++++++ .../analysis/multigeneration/ribosomeUsage.py | 364 ++++++++++++++++++ .../multigeneration/ribosome_components.py | 281 ++++++++++++++ .../multigeneration/ribosome_crowding.py | 255 ++++++++++++ .../multigeneration/rna_decay_03_high.py | 256 ++++++++++++ 6 files changed, 1678 insertions(+) create mode 100644 ecoli/analysis/multigeneration/replication.py create mode 100644 ecoli/analysis/multigeneration/ribosomeProduction.py create mode 100644 ecoli/analysis/multigeneration/ribosomeUsage.py create mode 100644 ecoli/analysis/multigeneration/ribosome_components.py create mode 100644 ecoli/analysis/multigeneration/ribosome_crowding.py create mode 100644 ecoli/analysis/multigeneration/rna_decay_03_high.py diff --git a/ecoli/analysis/multigeneration/replication.py b/ecoli/analysis/multigeneration/replication.py new file mode 100644 index 000000000..883f32f17 --- /dev/null +++ b/ecoli/analysis/multigeneration/replication.py @@ -0,0 +1,178 @@ +import altair as alt +import os +from typing import Any +import pickle + +from duckdb import DuckDBPyConnection +import polars as pl +import numpy as np + +from ecoli.library.parquet_emitter import ( + open_arbitrary_sim_data, + read_stacked_columns, +) + +CRITICAL_N = [1, 2, 4, 8] + + +def plot( + params: dict[str, Any], + conn: DuckDBPyConnection, + history_sql: str, + config_sql: str, + success_sql: str, + sim_data_dict: dict[str, dict[int, str]], + validation_data_paths: list[str], + outdir: str, + variant_metadata: dict[str, dict[int, Any]], + variant_names: dict[str, str], +): + # Load sim_data to get genome length + with open_arbitrary_sim_data(sim_data_dict) as f: + 
sim_data = pickle.load(f) + genome_length = len(sim_data.process.replication.genome_sequence) + + # Load all the required data + data_columns = [ + "time", + "listenersreplication_datanumberOfOric", + "listenersreplication_datacriticalInitiationMass", + "listenersreplication_datacriticalMassPerOriC", + "listenersreplication_datafork_coordinates", + "listenersmassdrymass", + "listenersmasscellmass", + ] + + plot_data = read_stacked_columns( + history_sql, + data_columns, + conn=conn, + ) + + # Convert to DataFrame and add time in hours + df = pl.DataFrame(plot_data).with_columns( + **{"Time (hr)": pl.col("time") / 3600} # Convert seconds to hours + ) + + # Calculate pairs of forks from fork coordinates + # Fork coordinates is a 2D array, count non-NaN values and divide by 2 + fork_coords = df["listenersreplication_datafork_coordinates"].to_numpy() + pairs_of_forks = [] + for coord_array in fork_coords: + if coord_array is not None and len(coord_array) > 0: + pairs_of_forks.append(np.sum(~np.isnan(coord_array)) / 2) + else: + pairs_of_forks.append(0) + + df = df.with_columns(pl.Series("pairs_of_forks", pairs_of_forks)) + + # Calculate critical mass equivalents + df = df.with_columns( + ( + pl.col("listenersmasscellmass") + / pl.col("listenersreplication_datacriticalInitiationMass") + ).alias("critical_mass_equivalents") + ) + + # Create individual plots + + # 1. 
Fork positions plot - this is complex due to the 2D nature, we'll create a simplified version + fork_positions_data = [] + for i, (time_val, coords) in enumerate(zip(df["Time (hr)"], fork_coords)): + if coords is not None and len(coords) > 0: + for coord in coords: + if not np.isnan(coord): + fork_positions_data.append( + {"Time (hr)": time_val, "Position": coord} + ) + + if fork_positions_data: + fork_df = pl.DataFrame(fork_positions_data) + fork_plot = ( + alt.Chart(fork_df) + .mark_circle(size=5) + .encode( + x=alt.X("Time (hr):Q"), + y=alt.Y( + "Position:Q", + scale=alt.Scale(domain=[-genome_length / 2, genome_length / 2]), + axis=alt.Axis( + values=[-genome_length / 2, 0, genome_length / 2], + labelExpr="datum.value == 0 ? 'oriC' : (datum.value < 0 ? '-terC' : '+terC')", + ), + title="DNA polymerase position (nt)", + ), + ) + .properties(title="DNA Polymerase Positions", width=600, height=100) + ) + else: + # Create empty plot if no fork data + fork_plot = ( + alt.Chart(pl.DataFrame({"x": [0], "y": [0]})) + .mark_text(text="No fork data available") + .encode(x="x:Q", y="y:Q") + .properties(width=600, height=100) + ) + + # 2. Pairs of forks plot + pairs_plot = df.plot.line( + x="Time (hr)", + y=alt.Y( + "pairs_of_forks", scale=alt.Scale(domain=[0, 6]), title="Pairs of forks" + ), + ).properties(title="Pairs of Replication Forks", width=600, height=100) + + # 3. Critical mass equivalents plot with reference lines + base_critical_plot = df.plot.line( + x="Time (hr)", + y=alt.Y( + "critical_mass_equivalents", title="Factors of critical initiation mass" + ), + ) + + # Add reference lines for critical N values + reference_lines = ( + alt.Chart( + pl.DataFrame({"y": CRITICAL_N, "label": [f"N={n}" for n in CRITICAL_N]}) + ) + .mark_rule(strokeDash=[5, 5], color="black") + .encode(y="y:Q") + ) + + critical_plot = (base_critical_plot + reference_lines).properties( + title="Factors of Critical Initiation Mass", width=600, height=100 + ) + + # 4. 
Dry mass plot + dry_mass_plot = df.plot.line( + x="Time (hr)", + y=alt.Y("listenersmassdryMass", title="Dry mass (fg)"), + ).properties(title="Dry Mass", width=600, height=100) + + # 5. Number of oriC plot + oric_plot = df.plot.line( + x="Time (hr)", + y=alt.Y("listenersreplication_datanumberOfOric", title="Number of oriC"), + ).properties(title="Number of oriC", width=600, height=100) + + # 6. Critical mass per oriC plot + mass_per_oric_plot = df.plot.line( + x="Time (hr)", + y=alt.Y( + "listenersreplication_datacriticalMassPerOriC", + title="Critical mass per oriC", + ), + ).properties(title="Critical Mass per oriC", width=600, height=100) + + # Combine all plots vertically + combined_plot = alt.vconcat( + fork_plot, + pairs_plot, + critical_plot, + dry_mass_plot, + oric_plot, + mass_per_oric_plot, + ).resolve_scale(x="shared") + + # Save the plot + combined_plot.save(os.path.join(outdir, "replication.html")) diff --git a/ecoli/analysis/multigeneration/ribosomeProduction.py b/ecoli/analysis/multigeneration/ribosomeProduction.py new file mode 100644 index 000000000..70635bf75 --- /dev/null +++ b/ecoli/analysis/multigeneration/ribosomeProduction.py @@ -0,0 +1,344 @@ +import altair as alt +import os +from typing import Any +import pickle +import polars as pl +import numpy as np +from duckdb import DuckDBPyConnection + +from ecoli.library.parquet_emitter import ( + field_metadata, + open_arbitrary_sim_data, +) + + +def plot( + params: dict[str, Any], + conn: DuckDBPyConnection, + history_sql: str, + config_sql: str, + success_sql: str, + sim_data_dict: dict[str, dict[int, str]], + validation_data_paths: list[str], + outdir: str, + variant_metadata: dict[str, dict[int, Any]], + variant_names: dict[str, str], +): + # Load sim_data + with open_arbitrary_sim_data(sim_data_dict) as f: + sim_data = pickle.load(f) + + # Get expected doubling time + expected_doubling_time = sim_data.doubling_time + + # Get ribosomal RNA IDs + ids_16s = [] + 
ids_16s.extend(sim_data.molecule_groups.s30_16s_rRNA) + ids_16s.append(sim_data.molecule_ids.s30_full_complex) + + ids_23s = [] + ids_23s.extend(sim_data.molecule_groups.s50_23s_rRNA) + ids_23s.append(sim_data.molecule_ids.s50_full_complex) + + ids_5s = [] + ids_5s.extend(sim_data.molecule_groups.s50_5s_rRNA) + ids_5s.append(sim_data.molecule_ids.s50_full_complex) + + # Get indices for ribosomal RNAs + bulk_molecule_ids = field_metadata(conn, config_sql, "listeners__bulk_molecules") + ids_16s_indexes = [ + bulk_molecule_ids.index(mol_id) + for mol_id in ids_16s + if mol_id in bulk_molecule_ids + ] + ids_23s_indexes = [ + bulk_molecule_ids.index(mol_id) + for mol_id in ids_23s + if mol_id in bulk_molecule_ids + ] + ids_5s_indexes = [ + bulk_molecule_ids.index(mol_id) + for mol_id in ids_5s + if mol_id in bulk_molecule_ids + ] + + # Get unique molecule index for active ribosome + unique_molecule_ids = field_metadata( + conn, config_sql, "listeners__unique_molecules" + ) + ribosome_index = ( + unique_molecule_ids.index("active_ribosome") + if "active_ribosome" in unique_molecule_ids + else None + ) + + # Get cistron indices for rRNA + cistron_ids = [ + cistron["id"] for cistron in sim_data.process.transcription.cistron_data + ] + + idx_16s = [] + for id16s in sim_data.molecule_groups.s30_16s_rRNA: + cistron_id = id16s[:-3] # Remove _rna suffix + if cistron_id in cistron_ids: + idx_16s.append(cistron_ids.index(cistron_id)) + + idx_23s = [] + for id23s in sim_data.molecule_groups.s50_23s_rRNA: + cistron_id = id23s[:-3] # Remove _rna suffix + if cistron_id in cistron_ids: + idx_23s.append(cistron_ids.index(cistron_id)) + + idx_5s = [] + for id5s in sim_data.molecule_groups.s50_5s_rRNA: + cistron_id = id5s[:-3] # Remove _rna suffix + if cistron_id in cistron_ids: + idx_5s.append(cistron_ids.index(cistron_id)) + + # Calculate expected initiation probabilities + condition = sim_data.condition + cistron_synth_prob = 
sim_data.process.transcription.cistron_tu_mapping_matrix.dot( + sim_data.process.transcription.rna_synth_prob[condition] + ) + + rrn16s_fit_init_prob = cistron_synth_prob[idx_16s].sum() if idx_16s else 0 + rrn23s_fit_init_prob = cistron_synth_prob[idx_23s].sum() if idx_23s else 0 + rrn5s_fit_init_prob = cistron_synth_prob[idx_5s].sum() if idx_5s else 0 + + # Define columns to read + columns_to_read = [ + "time", + "listeners__mass__instantaneous_growth_rate", + "listeners__main__timeStepSec", + "listeners__ribosome_data__rRNA16S_initiated", + "listeners__ribosome_data__rRNA23S_initiated", + "listeners__ribosome_data__rRNA5S_initiated", + "listeners__ribosome_data__rRNA16S_init_prob", + "listeners__ribosome_data__rRNA23S_init_prob", + "listeners__ribosome_data__rRNA5S_init_prob", + "listeners__ribosome_data__total_rna_init", + "listeners__ribosome_data__effectiveElongationRate", + ] + + # Add bulk molecule columns + if ids_16s_indexes: + for idx in ids_16s_indexes: + columns_to_read.append(f"listeners__bulk_molecules__{idx}") + if ids_23s_indexes: + for idx in ids_23s_indexes: + columns_to_read.append(f"listeners__bulk_molecules__{idx}") + if ids_5s_indexes: + for idx in ids_5s_indexes: + columns_to_read.append(f"listeners__bulk_molecules__{idx}") + + # Add unique molecule column + if ribosome_index is not None: + columns_to_read.append(f"listeners__unique_molecules__{ribosome_index}") + + # Read data + data_df = conn.execute(f""" + SELECT {", ".join(columns_to_read)} + FROM ({history_sql}) + ORDER BY variant_idx, generation, agent_id, time + """).pl() + + # Group by first cell of each generation (assuming agent_id=0 is first cell) + first_cell_data = data_df.filter(pl.col("agent_id") == 0) + + # Calculate derived metrics + time_min = first_cell_data["time"] / 60 + + # Growth rate and doubling time + growth_rate = first_cell_data["listeners__mass__instantaneous_growth_rate"] + doubling_time = np.log(2) / growth_rate + + # Calculate rRNA counts + rrn16s_bulk = ( + 
sum( + [ + first_cell_data[f"listeners__bulk_molecules__{idx}"] + for idx in ids_16s_indexes + ] + ) + if ids_16s_indexes + else pl.lit(0) + ) + rrn23s_bulk = ( + sum( + [ + first_cell_data[f"listeners__bulk_molecules__{idx}"] + for idx in ids_23s_indexes + ] + ) + if ids_23s_indexes + else pl.lit(0) + ) + rrn5s_bulk = ( + sum( + [ + first_cell_data[f"listeners__bulk_molecules__{idx}"] + for idx in ids_5s_indexes + ] + ) + if ids_5s_indexes + else pl.lit(0) + ) + + if ribosome_index is not None: + ribosome_count = first_cell_data[ + f"listeners__unique_molecules__{ribosome_index}" + ] + rrn16s_count = rrn16s_bulk + ribosome_count + rrn23s_count = rrn23s_bulk + ribosome_count + rrn5s_count = rrn5s_bulk + ribosome_count + else: + rrn16s_count = rrn16s_bulk + rrn23s_count = rrn23s_bulk + rrn5s_count = rrn5s_bulk + + # Calculate rRNA doubling times + time_step = first_cell_data["listeners__main__timeStepSec"] + + rrn16s_produced = first_cell_data["listeners__ribosome_data__rRNA16S_initiated"] + rrn23s_produced = first_cell_data["listeners__ribosome_data__rRNA23S_initiated"] + rrn5s_produced = first_cell_data["listeners__ribosome_data__rRNA5S_initiated"] + + # Avoid division by zero + rrn16s_doubling_time = ( + pl.when(rrn16s_produced > 0) + .then(np.log(2) / ((1 / time_step) * (rrn16s_produced / rrn16s_count))) + .otherwise(None) + / 60 + ) # Convert to minutes + + rrn23s_doubling_time = ( + pl.when(rrn23s_produced > 0) + .then(np.log(2) / ((1 / time_step) * (rrn23s_produced / rrn23s_count))) + .otherwise(None) + / 60 + ) + + rrn5s_doubling_time = ( + pl.when(rrn5s_produced > 0) + .then(np.log(2) / ((1 / time_step) * (rrn5s_produced / rrn5s_count))) + .otherwise(None) + / 60 + ) + + # Prepare plotting dataframe + plot_data = first_cell_data.with_columns( + [ + time_min.alias("Time (min)"), + doubling_time.alias("Doubling Time (min)"), + rrn16s_doubling_time.alias("16S Doubling Time (min)"), + rrn23s_doubling_time.alias("23S Doubling Time (min)"), + 
rrn5s_doubling_time.alias("5S Doubling Time (min)"), + ( + first_cell_data["listeners__ribosome_data__rRNA16S_init_prob"] + / first_cell_data["listeners__ribosome_data__total_rna_init"] + ).alias("16S Init Prob"), + ( + first_cell_data["listeners__ribosome_data__rRNA23S_init_prob"] + / first_cell_data["listeners__ribosome_data__total_rna_init"] + ).alias("23S Init Prob"), + ( + first_cell_data["listeners__ribosome_data__rRNA5S_init_prob"] + / first_cell_data["listeners__ribosome_data__total_rna_init"] + ).alias("5S Init Prob"), + first_cell_data["listeners__ribosome_data__effectiveElongationRate"].alias( + "Elongation Rate (aa/s)" + ), + pl.lit(expected_doubling_time.as_number() / 60).alias( + "Expected Doubling Time (min)" + ), + pl.lit(rrn16s_fit_init_prob).alias("Expected 16S Init Prob"), + pl.lit(rrn23s_fit_init_prob).alias("Expected 23S Init Prob"), + pl.lit(rrn5s_fit_init_prob).alias("Expected 5S Init Prob"), + ] + ) + + # Create plots + base = alt.Chart(plot_data).add_selection(alt.selection_interval(bind="scales")) + + # Doubling time plot + doubling_plot = base.mark_line(color="blue").encode( + x=alt.X("Time (min):Q"), + y=alt.Y("Doubling Time (min):Q", title="Doubling Time (min)"), + ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( + x=alt.X("Time (min):Q"), y=alt.Y("Expected Doubling Time (min):Q") + ) + + # 16S doubling time plot + rrn16s_plot = base.mark_line(color="blue").encode( + x=alt.X("Time (min):Q"), + y=alt.Y("16S Doubling Time (min):Q", title="16S Doubling Time (min)"), + ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( + x=alt.X("Time (min):Q"), y=alt.Y("Expected Doubling Time (min):Q") + ) + + # 23S doubling time plot + rrn23s_plot = base.mark_line(color="blue").encode( + x=alt.X("Time (min):Q"), + y=alt.Y("23S Doubling Time (min):Q", title="23S Doubling Time (min)"), + ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( + x=alt.X("Time (min):Q"), y=alt.Y("Expected Doubling Time (min):Q") + ) + + # 5S doubling time 
plot + rrn5s_plot = base.mark_line(color="blue").encode( + x=alt.X("Time (min):Q"), + y=alt.Y("5S Doubling Time (min):Q", title="5S Doubling Time (min)"), + ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( + x=alt.X("Time (min):Q"), y=alt.Y("Expected Doubling Time (min):Q") + ) + + # Initiation probability plots + init_16s_plot = base.mark_line(color="blue").encode( + x=alt.X("Time (min):Q"), y=alt.Y("16S Init Prob:Q", title="16S Init Prob") + ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( + x=alt.X("Time (min):Q"), y=alt.Y("Expected 16S Init Prob:Q") + ) + + init_23s_plot = base.mark_line(color="blue").encode( + x=alt.X("Time (min):Q"), y=alt.Y("23S Init Prob:Q", title="23S Init Prob") + ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( + x=alt.X("Time (min):Q"), y=alt.Y("Expected 23S Init Prob:Q") + ) + + init_5s_plot = base.mark_line(color="blue").encode( + x=alt.X("Time (min):Q"), y=alt.Y("5S Init Prob:Q", title="5S Init Prob") + ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( + x=alt.X("Time (min):Q"), y=alt.Y("Expected 5S Init Prob:Q") + ) + + # Elongation rate plot + elongation_plot = base.mark_line(color="blue").encode( + x=alt.X("Time (min):Q"), + y=alt.Y( + "Elongation Rate (aa/s):Q", title="Average Ribosome Elongation Rate (aa/s)" + ), + ) + + # Combine all plots vertically + combined_plot = alt.vconcat( + doubling_plot.properties(title="Cell Doubling Time", width=600, height=100), + rrn16s_plot.properties(title="16S rRNA Doubling Time", width=600, height=100), + rrn23s_plot.properties(title="23S rRNA Doubling Time", width=600, height=100), + rrn5s_plot.properties(title="5S rRNA Doubling Time", width=600, height=100), + init_16s_plot.properties( + title="16S rRNA Initiation Probability", width=600, height=100 + ), + init_23s_plot.properties( + title="23S rRNA Initiation Probability", width=600, height=100 + ), + init_5s_plot.properties( + title="5S rRNA Initiation Probability", width=600, height=100 + ), + 
elongation_plot.properties( + title="Ribosome Elongation Rate", width=600, height=100 + ), + resolve=alt.Resolve(scale=alt.ScaleResolve(y="independent")), + ) + + # Save the plot + combined_plot.save(os.path.join(outdir, "ribosome_production.html")) diff --git a/ecoli/analysis/multigeneration/ribosomeUsage.py b/ecoli/analysis/multigeneration/ribosomeUsage.py new file mode 100644 index 000000000..7167ab900 --- /dev/null +++ b/ecoli/analysis/multigeneration/ribosomeUsage.py @@ -0,0 +1,364 @@ +import altair as alt +import os +from typing import Any + +from duckdb import DuckDBPyConnection +import pickle +import polars as pl + +from ecoli.library.parquet_emitter import ( + field_metadata, + open_arbitrary_sim_data, + read_stacked_columns, +) + + +def plot( + params: dict[str, Any], + conn: DuckDBPyConnection, + history_sql: str, + config_sql: str, + success_sql: str, + sim_data_dict: dict[str, dict[int, str]], + validation_data_paths: list[str], + outdir: str, + variant_metadata: dict[str, dict[int, Any]], + variant_names: dict[str, str], +): + # Get sim data from pickle file + with open_arbitrary_sim_data(sim_data_dict) as f: + sim_data = pickle.load(f) + + # Get ids for 30S and 50S subunits + complexIds30S = [sim_data.molecule_ids.s30_full_complex] + complexIds50S = [sim_data.molecule_ids.s50_full_complex] + + # Get molecular weights for 30S and 50S subunits, and add these two for 70S + nAvogadro = sim_data.constants.n_avogadro + mw30S = sim_data.getter.get_masses(complexIds30S) + mw50S = sim_data.getter.get_masses(complexIds50S) + mw70S = mw30S + mw50S + + # Load data + bulk_molecule_data = read_stacked_columns( + history_sql, + [ + "listeners__bulk_molecules__counts", + "listeners__unique_molecule_counts__unique_molecule_counts", + "listeners__ribosome_data__did_initialize", + "listeners__ribosome_data__actual_elongations", + "listeners__ribosome_data__did_terminate", + "listeners__ribosome_data__effective_elongation_rate", + "listeners__mass__cell_mass", + "time", 
+ "time_step_sec", + ], + conn=conn, + ) + + # Convert to DataFrame + df = pl.DataFrame(bulk_molecule_data).with_columns( + **{ + "Time (min)": pl.col("time") / 60, + "Cell Volume (L)": (pl.col("listeners__mass__cell_mass") * 1e-15) + / sim_data.constants.cell_density, + } + ) + + # Get indexes for 30S and 50S subunits based on ids + bulk_molecule_ids = field_metadata( + conn, config_sql, "listeners__bulk_molecules__counts" + ) + complexIndexes30S = [bulk_molecule_ids.index(comp) for comp in complexIds30S] + complexIndexes50S = [bulk_molecule_ids.index(comp) for comp in complexIds50S] + + # Get indexes for active ribosomes + unique_molecule_ids = field_metadata( + conn, config_sql, "listeners__unique_molecule_counts__unique_molecule_counts" + ) + ribosomeIndex = unique_molecule_ids.index("active_ribosome") + + # Extract specific columns from arrays + df = df.with_columns( + [ + pl.col("listeners__bulk_molecules__counts") + .list.get(complexIndexes30S[0]) + .alias("counts_30S"), + pl.col("listeners__bulk_molecules__counts") + .list.get(complexIndexes50S[0]) + .alias("counts_50S"), + pl.col("listeners__unique_molecule_counts__unique_molecule_counts") + .list.get(ribosomeIndex) + .alias("active_ribosome_counts"), + ] + ) + + # Calculate ribosome statistics + df = df.with_columns( + [ + # Total ribosome counts + ( + pl.col("active_ribosome_counts") + + pl.min_horizontal([pl.col("counts_30S"), pl.col("counts_50S")]) + ).alias("total_ribosome_counts"), + # Concentrations + ( + (1 / nAvogadro) + * pl.col("active_ribosome_counts") + / pl.col("Cell Volume (L)") + ).alias("active_ribosome_concentration_M"), + # Masses + ((1 / nAvogadro) * pl.col("counts_30S") * mw30S).alias("mass_30S"), + ((1 / nAvogadro) * pl.col("counts_50S") * mw50S).alias("mass_50S"), + ((1 / nAvogadro) * pl.col("active_ribosome_counts") * mw70S).alias( + "active_ribosome_mass" + ), + # Rates per time*volume + ( + pl.col("listeners__ribosome_data__did_initialize") + / (pl.col("time_step_sec") * 
pl.col("Cell Volume (L)")) + ).alias("activations_per_time_volume"), + ( + pl.col("listeners__ribosome_data__did_terminate") + / (pl.col("time_step_sec") * pl.col("Cell Volume (L)")) + ).alias("deactivations_per_time_volume"), + ] + ) + + # Calculate additional derived columns + df = df.with_columns( + [ + # Total ribosome concentration + ( + (1 / nAvogadro) + * pl.col("total_ribosome_counts") + / pl.col("Cell Volume (L)") + ).alias("total_ribosome_concentration_M"), + # Molar fraction active + ( + pl.col("active_ribosome_counts").cast(pl.Float64) + / pl.col("total_ribosome_counts") + ).alias("molar_fraction_active"), + # Total ribosome mass and mass fraction + ( + pl.col("active_ribosome_mass") + pl.col("mass_30S") + pl.col("mass_50S") + ).alias("total_ribosome_mass"), + ] + ) + + # Calculate mass fraction active + df = df.with_columns( + [ + (pl.col("active_ribosome_mass") / pl.col("total_ribosome_mass")).alias( + "mass_fraction_active" + ), + ] + ) + + # Convert concentrations to mM + df = df.with_columns( + [ + (pl.col("active_ribosome_concentration_M") * 1000).alias( + "active_ribosome_concentration_mM" + ), + (pl.col("total_ribosome_concentration_M") * 1000).alias( + "total_ribosome_concentration_mM" + ), + ] + ) + + # Create individual plots + plots = [] + + # Time step plot + timestep_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y("time_step_sec:Q", title="Length of time step (s)"), + ) + .properties(title="Time Step", width=300, height=150) + ) + plots.append(timestep_plot) + + # Cell volume plot + volume_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y("Cell Volume (L):Q", title="Cell volume (L)"), + ) + .properties(title="Cell Volume", width=300, height=150) + ) + plots.append(volume_plot) + + # Total ribosome counts + total_counts_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time 
(min):Q", title="Time (min)"), + y=alt.Y("total_ribosome_counts:Q", title="Total ribosome count"), + ) + .properties(title="Total Ribosome Count", width=300, height=150) + ) + plots.append(total_counts_plot) + + # Total ribosome concentration + total_conc_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y("total_ribosome_concentration_mM:Q", title="[Total ribosome] (mM)"), + ) + .properties(title="Total Ribosome Concentration", width=300, height=150) + ) + plots.append(total_conc_plot) + + # Active ribosome counts + active_counts_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y("active_ribosome_counts:Q", title="Active ribosome count"), + ) + .properties(title="Active Ribosome Count", width=300, height=150) + ) + plots.append(active_counts_plot) + + # Active ribosome concentration + active_conc_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y( + "active_ribosome_concentration_mM:Q", title="[Active ribosome] (mM)" + ), + ) + .properties(title="Active Ribosome Concentration", width=300, height=150) + ) + plots.append(active_conc_plot) + + # Molar fraction active + molar_fraction_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y("molar_fraction_active:Q", title="Molar fraction active ribosomes"), + ) + .properties(title="Molar Fraction Active Ribosomes", width=300, height=150) + ) + plots.append(molar_fraction_plot) + + # Mass fraction active + mass_fraction_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y("mass_fraction_active:Q", title="Mass fraction active ribosomes"), + ) + .properties(title="Mass Fraction Active Ribosomes", width=300, height=150) + ) + plots.append(mass_fraction_plot) + + # Activations per timestep + 
activations_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y( + "listeners__ribosome_data__did_initialize:Q", + title="Activations per timestep", + ), + ) + .properties(title="Activations per Timestep", width=300, height=150) + ) + plots.append(activations_plot) + + # Deactivations per timestep + deactivations_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y( + "listeners__ribosome_data__did_terminate:Q", + title="Deactivations per timestep", + ), + ) + .properties(title="Deactivations per Timestep", width=300, height=150) + ) + plots.append(deactivations_plot) + + # Activations per time*volume + activations_tv_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y( + "activations_per_time_volume:Q", title="Activations per time*volume" + ), + ) + .properties(title="Activations per Time*Volume", width=300, height=150) + ) + plots.append(activations_tv_plot) + + # Deactivations per time*volume + deactivations_tv_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y( + "deactivations_per_time_volume:Q", title="Deactivations per time*volume" + ), + ) + .properties(title="Deactivations per Time*Volume", width=300, height=150) + ) + plots.append(deactivations_tv_plot) + + # AA translated + aa_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y( + "listeners__ribosome_data__actual_elongations:Q", title="AA translated" + ), + ) + .properties(title="Amino Acids Translated", width=300, height=150) + ) + plots.append(aa_plot) + + # Effective elongation rate + elongation_plot = ( + alt.Chart(df.to_pandas()) + .mark_line() + .encode( + x=alt.X("Time (min):Q", title="Time (min)"), + y=alt.Y( + 
"listeners__ribosome_data__effective_elongation_rate:Q", + title="Effective elongation rate", + ), + ) + .properties(title="Effective Elongation Rate", width=300, height=150) + ) + plots.append(elongation_plot) + + # Combine all plots in a grid layout (7 rows, 2 columns) + left_column = alt.vconcat(*plots[:7]) + right_column = alt.vconcat(*plots[7:]) + combined_plot = alt.hconcat(left_column, right_column) + + # Save the plot + combined_plot.save(os.path.join(outdir, "ribosome_usage.html")) diff --git a/ecoli/analysis/multigeneration/ribosome_components.py b/ecoli/analysis/multigeneration/ribosome_components.py new file mode 100644 index 000000000..bf1ba6321 --- /dev/null +++ b/ecoli/analysis/multigeneration/ribosome_components.py @@ -0,0 +1,281 @@ +import altair as alt +import os +from typing import Any + +from duckdb import DuckDBPyConnection +import pickle +import polars as pl +import numpy as np + +from ecoli.library.parquet_emitter import ( + field_metadata, + open_arbitrary_sim_data, + named_idx, + read_stacked_columns, +) + + +def plot( + params: dict[str, Any], + conn: DuckDBPyConnection, + history_sql: str, + config_sql: str, + success_sql: str, + sim_data_dict: dict[str, dict[int, str]], + validation_data_paths: list[str], + outdir: str, + variant_metadata: dict[str, dict[int, Any]], + variant_names: dict[str, str], +): + """ + Plots the timetrace of counts for each of the components of the ribosomal + subunits (rRNAs and ribosomal proteins). 
+ """ + + # Load sim_data + with open_arbitrary_sim_data(sim_data_dict) as f: + sim_data = pickle.load(f) + + # Load IDs of ribosome components from sim_data + s30_protein_ids = sim_data.molecule_groups.s30_proteins + s30_16s_rRNA_ids = sim_data.molecule_groups.s30_16s_rRNA + s30_full_complex_id = [sim_data.molecule_ids.s30_full_complex] + s50_protein_ids = sim_data.molecule_groups.s50_proteins + s50_23s_rRNA_ids = sim_data.molecule_groups.s50_23s_rRNA + s50_5s_rRNA_ids = sim_data.molecule_groups.s50_5s_rRNA + s50_full_complex_id = [sim_data.molecule_ids.s50_full_complex] + + # Get complexation stoichiometries of ribosomal proteins + complexation = sim_data.process.complexation + s30_monomers = complexation.get_monomers(s30_full_complex_id[0]) + s50_monomers = complexation.get_monomers(s50_full_complex_id[0]) + s30_subunit_id_to_stoich = { + subunit_id: stoich + for (subunit_id, stoich) in zip( + s30_monomers["subunitIds"], s30_monomers["subunitStoich"] + ) + } + s50_subunit_id_to_stoich = { + subunit_id: stoich + for (subunit_id, stoich) in zip( + s50_monomers["subunitIds"], s50_monomers["subunitStoich"] + ) + } + s30_protein_stoich = np.array( + [s30_subunit_id_to_stoich[subunit_id] for subunit_id in s30_protein_ids] + ) + s50_protein_stoich = np.array( + [s50_subunit_id_to_stoich[subunit_id] for subunit_id in s50_protein_ids] + ) + + # Get metadata for extracting indexes + unique_molecule_metadata = field_metadata( + conn, config_sql, "listeners__unique_molecule_counts" + ) + monomer_metadata = field_metadata(conn, config_sql, "listeners__monomer_counts") + + # Extract indexes + active_ribosome_index = unique_molecule_metadata.index("active_ribosome") + + monomer_id_to_index = { + monomer_id: i for (i, monomer_id) in enumerate(monomer_metadata) + } + s30_protein_indexes = [ + monomer_id_to_index[protein_id] for protein_id in s30_protein_ids + ] + s50_protein_indexes = [ + monomer_id_to_index[protein_id] for protein_id in s50_protein_ids + ] + + # Define named 
indexes for bulk molecules + s30_16s_rRNAs = named_idx( + "listeners__bulk_molecules", + s30_16s_rRNA_ids, + list(range(len(s30_16s_rRNA_ids))), # Will be resolved by named_idx + ) + s50_23s_rRNAs = named_idx( + "listeners__bulk_molecules", + s50_23s_rRNA_ids, + list(range(len(s50_23s_rRNA_ids))), + ) + s50_5s_rRNAs = named_idx( + "listeners__bulk_molecules", s50_5s_rRNA_ids, list(range(len(s50_5s_rRNA_ids))) + ) + s30_full_complex = named_idx("listeners__bulk_molecules", s30_full_complex_id, [0]) + s50_full_complex = named_idx("listeners__bulk_molecules", s50_full_complex_id, [0]) + + # Named indexes for monomer counts + s30_proteins = named_idx( + "listeners__monomer_counts", s30_protein_ids, s30_protein_indexes + ) + s50_proteins = named_idx( + "listeners__monomer_counts", s50_protein_ids, s50_protein_indexes + ) + + # Named index for active ribosomes + active_ribosomes = named_idx( + "listeners__unique_molecule_counts", + ["active_ribosome"], + [active_ribosome_index], + ) + + # Load data + ribosome_data = read_stacked_columns( + history_sql, + [ + s30_16s_rRNAs, + s50_23s_rRNAs, + s50_5s_rRNAs, + s30_full_complex, + s50_full_complex, + s30_proteins, + s50_proteins, + active_ribosomes, + ], + conn=conn, + ) + + # Convert to DataFrame and add time column + df = pl.DataFrame(ribosome_data).with_columns(**{"Time (min)": pl.col("time") / 60}) + + # Calculate protein counts divided by stoichiometry + s30_protein_counts_cols = [] + for i, protein_id in enumerate(s30_protein_ids): + col_name = f"s30_protein_{i}_normalized" + df = df.with_columns( + (pl.col(protein_id) / s30_protein_stoich[i]).alias(col_name) + ) + s30_protein_counts_cols.append(col_name) + + s50_protein_counts_cols = [] + for i, protein_id in enumerate(s50_protein_ids): + col_name = f"s50_protein_{i}_normalized" + df = df.with_columns( + (pl.col(protein_id) / s50_protein_stoich[i]).alias(col_name) + ) + s50_protein_counts_cols.append(col_name) + + # Calculate limiting protein counts and total counts + 
df = df.with_columns( + [ + # S30 calculations + pl.min_horizontal(s30_protein_counts_cols).alias( + "s30_limiting_protein_counts" + ), + ( + pl.sum_horizontal(s30_16s_rRNA_ids) + + pl.col(s30_full_complex_id[0]) + + pl.col("active_ribosome") + ).alias("s30_16s_rRNA_total_counts"), + (pl.col(s30_full_complex_id[0]) + pl.col("active_ribosome")).alias( + "s30_total_counts" + ), + # S50 calculations + pl.min_horizontal(s50_protein_counts_cols).alias( + "s50_limiting_protein_counts" + ), + ( + pl.sum_horizontal(s50_23s_rRNA_ids) + + pl.col(s50_full_complex_id[0]) + + pl.col("active_ribosome") + ).alias("s50_23s_rRNA_total_counts"), + ( + pl.sum_horizontal(s50_5s_rRNA_ids) + + pl.col(s50_full_complex_id[0]) + + pl.col("active_ribosome") + ).alias("s50_5s_rRNA_total_counts"), + (pl.col(s50_full_complex_id[0]) + pl.col("active_ribosome")).alias( + "s50_total_counts" + ), + ] + ) + + # Create plots + # 30S components plot + s30_plot = ( + alt.Chart(df) + .mark_line() + .encode( + x=alt.X("Time (min):Q").title("Time (min)"), + y=alt.Y("value:Q").title("30S component counts").scale(domain=[0, 60000]), + color=alt.Color("variable:N").title("Component"), + strokeDash=alt.StrokeDash("variable:N").scale( + domain=[ + "s30_limiting_protein_counts", + "s30_16s_rRNA_total_counts", + "s30_total_counts", + ], + range=[[5, 5], [0], [3, 3]], + ), + ) + .transform_fold( + [ + "s30_limiting_protein_counts", + "s30_16s_rRNA_total_counts", + "s30_total_counts", + ], + as_=["variable", "value"], + ) + .transform_calculate( + variable_label="datum.variable === 's30_limiting_protein_counts' ? 'limiting r-protein' : " + "datum.variable === 's30_16s_rRNA_total_counts' ? 
'16S rRNA' : '30S subunit'" + ) + .encode( + color=alt.Color("variable_label:N") + .title("Component") + .scale( + domain=["limiting r-protein", "16S rRNA", "30S subunit"], + range=["#cccccc", "#1f77b4", "#000000"], + ) + ) + .properties(title="30S Ribosomal Subunit Components", width=600, height=200) + ) + + # 50S components plot + s50_plot = ( + alt.Chart(df) + .mark_line() + .encode( + x=alt.X("Time (min):Q").title("Time (min)"), + y=alt.Y("value:Q").title("50S component counts").scale(domain=[0, 60000]), + color=alt.Color("variable:N").title("Component"), + strokeDash=alt.StrokeDash("variable:N").scale( + domain=[ + "s50_limiting_protein_counts", + "s50_23s_rRNA_total_counts", + "s50_5s_rRNA_total_counts", + "s50_total_counts", + ], + range=[[5, 5], [0], [0], [3, 3]], + ), + ) + .transform_fold( + [ + "s50_limiting_protein_counts", + "s50_23s_rRNA_total_counts", + "s50_5s_rRNA_total_counts", + "s50_total_counts", + ], + as_=["variable", "value"], + ) + .transform_calculate( + variable_label="datum.variable === 's50_limiting_protein_counts' ? 'limiting r-protein' : " + "datum.variable === 's50_23s_rRNA_total_counts' ? '23S rRNA' : " + "datum.variable === 's50_5s_rRNA_total_counts' ? 
'5S rRNA' : '50S subunit'" + ) + .encode( + color=alt.Color("variable_label:N") + .title("Component") + .scale( + domain=["limiting r-protein", "23S rRNA", "5S rRNA", "50S subunit"], + range=["#cccccc", "#ff7f0e", "#2ca02c", "#000000"], + ) + ) + .properties(title="50S Ribosomal Subunit Components", width=600, height=200) + ) + + # Combine plots vertically + combined_plot = alt.vconcat(s30_plot, s50_plot).resolve_scale(color="independent") + + # Save the plot + combined_plot.save(os.path.join(outdir, "ribosome_components.html")) diff --git a/ecoli/analysis/multigeneration/ribosome_crowding.py b/ecoli/analysis/multigeneration/ribosome_crowding.py new file mode 100644 index 000000000..08d88ec8b --- /dev/null +++ b/ecoli/analysis/multigeneration/ribosome_crowding.py @@ -0,0 +1,255 @@ +import os +import pickle + +from typing import Any + +import altair as alt +import polars as pl +import numpy as np +from duckdb import DuckDBPyConnection + +from ecoli.library.parquet_emitter import ( + open_arbitrary_sim_data, + field_metadata, + named_idx, + read_stacked_columns, +) + +# Maximum number of overcrowded proteins to plot +MAX_NUMBER_OF_MONOMERS_TO_PLOT = 300 + + +def plot( + params: dict[str, Any], + conn: DuckDBPyConnection, + history_sql: str, + config_sql: str, + success_sql: str, + sim_data_dict: dict[str, dict[int, str]], + validation_data_paths: list[str], + outdir: str, + variant_metadata: dict[str, dict[int, Any]], + variant_names: dict[str, str], +): + """ + Compare target vs actual translation probabilities for mRNAs + whose translation probabilities were limited by ribosome crowding. + """ + + # 1. Load sim_data from arbitrary source (as in new_gene example) + with open_arbitrary_sim_data(sim_data_dict) as f: + sim_data = pickle.load(f) + + # 2. 
def plot(
    params: dict[str, Any],
    conn: DuckDBPyConnection,
    history_sql: str,
    config_sql: str,
    success_sql: str,
    sim_data_dict: dict[str, dict[int, str]],
    validation_data_paths: list[str],
    outdir: str,
    variant_metadata: dict[str, dict[int, Any]],
    variant_names: dict[str, str],
):
    """
    Compare target vs actual translation probabilities for mRNAs
    whose translation probabilities were limited by ribosome crowding.

    One line chart (target and actual probability over time) is drawn for
    every monomer whose target probability exceeded its actual probability
    at any time point, up to ``MAX_NUMBER_OF_MONOMERS_TO_PLOT`` charts. The
    charts are stacked vertically and written to
    ``<outdir>/ribosome_crowding.html``.

    Returns early (after printing a message) when no monomer ID matches the
    listener metadata, when no rows were read, or when no monomer was
    overcrowded.

    Raises:
        RuntimeError: if the listener field metadata cannot be fetched or
            the data read back has no ``time`` column.
    """
    # Listener fields holding the per-monomer probabilities. They are defined
    # once so the metadata lookup and the data read always refer to the same
    # columns.
    # NOTE(review): confirm these names against the emitted listener schema.
    target_field = "listeners__ribosome_data__target_prob_translation_per_transcript"
    actual_field = "listeners__ribosome_data__actual_prob_translation_per_transcript"

    # 1. Load sim_data from an arbitrary variant.
    with open_arbitrary_sim_data(sim_data_dict) as f:
        sim_data = pickle.load(f)

    # 2. Monomer IDs and the monomer -> mRNA (cistron) -> gene mappings.
    mRNA_sim_array = sim_data.process.transcription.cistron_data.struct_array
    monomer_sim_array = sim_data.process.translation.monomer_data.struct_array
    monomer_ids: list[str] = monomer_sim_array["id"].tolist()
    monomer_to_mRNA: dict[str, str] = dict(
        zip(monomer_sim_array["id"], monomer_sim_array["cistron_id"])
    )
    mRNA_to_gene: dict[str, str] = dict(
        zip(mRNA_sim_array["id"], mRNA_sim_array["gene_id"])
    )

    # 3. Element names of both listener arrays, in array order.
    try:
        target_columns: list[str] = field_metadata(conn, config_sql, target_field)
        actual_columns: list[str] = field_metadata(conn, config_sql, actual_field)
    except Exception as err:
        # Keep the original exception as the cause so the real failure
        # (missing field, bad query, ...) is still visible in the traceback.
        raise RuntimeError(
            "Failed to fetch field metadata for ribosome data. "
            "Please replace the listener names in field_metadata(...) with your actual names."
        ) from err

    # Element name -> position in the listener array.
    # Assumes the metadata entries are monomer IDs -- TODO confirm.
    target_idx_dict = {col: i for i, col in enumerate(target_columns)}
    actual_idx_dict = {col: i for i, col in enumerate(actual_columns)}

    missing_target = [mon for mon in monomer_ids if mon not in target_idx_dict]
    missing_actual = [mon for mon in monomer_ids if mon not in actual_idx_dict]
    if missing_target or missing_actual:
        # Warn, then continue with the monomers present in both fields.
        print(
            f"Warning: some monomer IDs not found in target/actual fields. "
            f"Missing in target: {missing_target[:5]}{'...' if len(missing_target) > 5 else ''}; "
            f"Missing in actual: {missing_actual[:5]}{'...' if len(missing_actual) > 5 else ''}."
        )

    valid_monomer_ids = [
        mon for mon in monomer_ids if mon in target_idx_dict and mon in actual_idx_dict
    ]
    if not valid_monomer_ids:
        print(
            "No overlapping monomer IDs found in ribosome_data listeners; aborting plot."
        )
        return

    # 4. Read both arrays in a single query. Distinct "target_"/"actual_"
    # aliases keep the columns apart, and one query guarantees that target
    # and actual values of a row belong to the same time step.
    target_names = [f"target_{mon}" for mon in valid_monomer_ids]
    actual_names = [f"actual_{mon}" for mon in valid_monomer_ids]
    target_named = named_idx(
        target_field,
        target_names,
        [target_idx_dict[mon] for mon in valid_monomer_ids],
    )
    actual_named = named_idx(
        actual_field,
        actual_names,
        [actual_idx_dict[mon] for mon in valid_monomer_ids],
    )
    df = pl.DataFrame(
        read_stacked_columns(history_sql, [target_named, actual_named], conn=conn)
    )

    if "time" not in df.columns:
        raise RuntimeError("No 'time' column found in ribosome data readout.")
    if df.height == 0:
        print("No ribosome data rows were read; nothing to plot.")
        return
    df = df.with_columns((pl.col("time") / 60).alias("Time (min)"))

    # 5. A monomer is overcrowded when target - actual is positive at any
    # time point. Both matrices have shape (time points, monomers).
    target_matrix = df.select(target_names).to_numpy()
    actual_matrix = df.select(actual_names).to_numpy()
    max_diff = (target_matrix - actual_matrix).max(axis=0)
    overcrowded_indices = np.flatnonzero(max_diff > 0).tolist()
    n_overcrowded = len(overcrowded_indices)

    if n_overcrowded == 0:
        print("No overcrowded mRNAs detected in this simulation; nothing to plot.")
        return

    n_to_plot = min(n_overcrowded, MAX_NUMBER_OF_MONOMERS_TO_PLOT)
    if n_overcrowded > MAX_NUMBER_OF_MONOMERS_TO_PLOT:
        print(
            f"Total overcrowded proteins: {n_overcrowded}. "
            f"Plotting first {MAX_NUMBER_OF_MONOMERS_TO_PLOT} only."
        )

    # 6. One small chart per overcrowded monomer.
    charts = []
    for idx_in_list in overcrowded_indices[:n_to_plot]:
        monomer = valid_monomer_ids[idx_in_list]
        gene = mRNA_to_gene.get(monomer_to_mRNA.get(monomer, ""), "unknown")
        # Long-form frame (Time, Type, Probability): all target rows followed
        # by all actual rows.
        long_df = pl.concat(
            [
                df.select(
                    pl.col("Time (min)"),
                    pl.lit(kind).alias("Type"),
                    pl.col(f"{kind}_{monomer}").alias("Probability"),
                )
                for kind in ("target", "actual")
            ]
        )
        chart = (
            alt.Chart(long_df)
            .mark_line()
            .encode(
                x=alt.X("Time (min)", title="Time (min)"),
                y=alt.Y("Probability", title="Translation Probability"),
                color="Type",
            )
            .properties(title=f"{gene} (monomer {monomer})")
            .interactive()
        )
        charts.append(chart)

    # 7. Stack all charts vertically and save as HTML.
    combined = (
        alt.vconcat(*charts)
        .configure_axis(labelFontSize=10, titleFontSize=12)
        .configure_title(fontSize=14)
    )

    os.makedirs(outdir, exist_ok=True)
    outpath = os.path.join(outdir, "ribosome_crowding.html")
    combined.save(outpath)
    print(f"Saved ribosome crowding plot to {outpath}")
"""
Plot dynamic traces of genes with high expression (> 20 counts of mRNA)

EG10367_RNA  24.8  gapA  Glyceraldehyde 3-phosphate dehydrogenase
EG11036_RNA  25.2  tufA  Elongation factor Tu
EG50002_RNA  26.2  rpmA  50S Ribosomal subunit protein L27
EG10671_RNA  30.1  ompF  Outer membrane protein F
EG50003_RNA  38.7  acpP  Apo-[acyl carrier protein]
EG10669_RNA  41.1  ompA  Outer membrane protein A
EG10873_RNA  44.7  rplL  50S Ribosomal subunit protein L7/L12 dimer
EG12179_RNA  46.2  cspE  Transcription antiterminator and regulator of RNA stability
EG10321_RNA  53.2  fliC  Flagellin
EG10544_RNA  97.5  lpp   Murein lipoprotein
"""

import altair as alt
import os
from typing import Any, cast
import pickle
import polars as pl
import numpy as np

from duckdb import DuckDBPyConnection
from ecoli.library.parquet_emitter import (
    field_metadata,
    open_arbitrary_sim_data,
    named_idx,
    read_stacked_columns,
)


def plot(
    params: dict[str, Any],
    conn: DuckDBPyConnection,
    history_sql: str,
    config_sql: str,
    success_sql: str,
    sim_data_dict: dict[str, dict[int, str]],
    validation_data_paths: list[str],
    outdir: str,
    variant_metadata: dict[str, dict[int, Any]],
    variant_names: dict[str, str],
):
    """
    Scatter smoothed RNA degradation rate against smoothed RNA count for a
    fixed set of highly expressed cistrons.

    For every simulation, the per-cistron degraded counts (divided by the
    time step) and the mRNA cistron counts are smoothed with a moving
    average of ``N`` points, and ``N`` points are trimmed from each end.
    The traces of all simulations are stacked, a degradation constant is
    fitted per cistron by least squares (``degraded = kdeg * counts``), and
    up to nine scatter plots are written to
    ``<outdir>/rna_decay_03_high.html``. Each plot title shows the fitted
    constant next to the ``deg_rate`` stored in sim_data.

    Simulations with ``2 * N`` rows or fewer are skipped. A message is
    printed instead of a file being written when no simulation is long
    enough or no chart could be produced.

    Raises:
        KeyError: if one of the tracked cistrons is missing from the
            listener metadata.
    """
    # Listener fields, written with the "listeners__<listener>__<field>"
    # separators used by the other analyses in this package.
    degradation_field = "listeners__rna_degradation__count_RNA_degraded_per_cistron"
    counts_field = "listeners__rna_counts__mRNA_cistron_counts"
    N = 100  # smoothing window (points); also the plotting stride
    max_subplots = 9

    # Load sim_data
    with open_arbitrary_sim_data(sim_data_dict) as f:
        sim_data = pickle.load(f)

    all_cistron_ids = sim_data.process.transcription.cistron_data["id"].tolist()

    cistron_ids = [
        "EG10367_RNA",
        "EG11036_RNA",
        "EG50002_RNA",
        "EG10671_RNA",
        "EG50003_RNA",
        "EG10669_RNA",
        "EG10873_RNA",
        "EG12179_RNA",
        "EG10321_RNA",
        "EG10544_RNA",
    ]

    names = [
        "gapA - Glyceraldehyde 3-phosphate dehydrogenase",
        "tufA - Elongation factor Tu",
        "rpmA - 50S Ribosomal subunit protein L27",
        "ompF - Outer membrane protein F",
        "acpP - Apo-[acyl carrier protein]",
        "ompA - Outer membrane protein A",
        "rplL - 50S Ribosomal subunit protein L7/L12 dimer",
        "cspE - Transcription antiterminator and regulator of RNA stability",
        "fliC - Flagellin",
        "lpp - Murein lipoprotein",
    ]

    cistron_idxs = [all_cistron_ids.index(x) for x in cistron_ids]
    deg_rates = sim_data.process.transcription.cistron_data["deg_rate"][cistron_idxs]
    # NOTE(review): deg_rate may be a unit-carrying array; strip the units
    # if so, because the chart title formats the value with ":.1e".
    if hasattr(deg_rates, "asNumber"):
        deg_rates = deg_rates.asNumber()

    # Position of every cistron inside each listener array.
    rna_degradation_idx_dict = {
        cistron: i
        for i, cistron in enumerate(field_metadata(conn, config_sql, degradation_field))
    }
    rna_counts_idx_dict = {
        cistron: i
        for i, cistron in enumerate(field_metadata(conn, config_sql, counts_field))
    }

    # Fail here with a clear message rather than later with a None index.
    missing = [
        cistron_id
        for cistron_id in cistron_ids
        if cistron_id not in rna_degradation_idx_dict
        or cistron_id not in rna_counts_idx_dict
    ]
    if missing:
        raise KeyError(f"Cistrons not found in listener metadata: {missing}")

    # Both listeners are indexed by the same cistron IDs, so each selected
    # element gets a prefixed alias; otherwise the two selections would
    # produce identically named columns.
    degraded_names = [f"degraded__{cistron_id}" for cistron_id in cistron_ids]
    counts_names = [f"counts__{cistron_id}" for cistron_id in cistron_ids]
    degradation_columns = named_idx(
        degradation_field,
        degraded_names,
        [rna_degradation_idx_dict[cistron_id] for cistron_id in cistron_ids],
    )
    counts_columns = named_idx(
        counts_field,
        counts_names,
        [rna_counts_idx_dict[cistron_id] for cistron_id in cistron_ids],
    )

    # "time" is not requested explicitly: the other analyses in this package
    # rely on read_stacked_columns supplying it, and asking for it again
    # would duplicate the column.
    # NOTE(review): confirm that "timeStepSec" is the emitted time-step column.
    data = read_stacked_columns(
        history_sql,
        [degradation_columns, counts_columns, "timeStepSec"],
        conn=conn,
    )
    df = pl.DataFrame(data)

    # One frame per simulation, split on whichever identifier columns are
    # present in the result.
    id_columns = [
        col
        for col in ("experiment_id", "variant", "lineage_seed", "generation", "agent_id")
        if col in df.columns
    ]
    sim_frames = df.partition_by(id_columns, maintain_order=True) if id_columns else [df]

    kernel = np.ones(N) / N
    degraded_blocks = []
    counts_blocks = []
    for sim_df in sim_frames:
        sim_df = sim_df.sort("time")
        dt = sim_df["timeStepSec"].to_numpy()
        if len(dt) <= 2 * N:
            # Too short to smooth and then trim N points from both ends.
            continue

        # Shape (time points, cistrons).
        degraded_counts = sim_df.select(degraded_names).to_numpy()
        rna_counts = sim_df.select(counts_names).to_numpy()

        degraded_smoothed = np.column_stack(
            [
                np.convolve(degraded_counts[:, col_idx] / dt, kernel, mode="same")
                for col_idx in range(degraded_counts.shape[1])
            ]
        )
        counts_smoothed = np.column_stack(
            [
                np.convolve(rna_counts[:, col_idx], kernel, mode="same")
                for col_idx in range(rna_counts.shape[1])
            ]
        )

        # Drop the edges, where the moving average is incomplete.
        degraded_blocks.append(degraded_smoothed[N:-N, :])
        counts_blocks.append(counts_smoothed[N:-N, :])

    if not degraded_blocks:
        print("No data available for processing")
        return

    # Stack all simulations along the time axis.
    all_degraded = np.vstack(degraded_blocks)
    all_counts = np.vstack(counts_blocks)

    charts = []
    for subplot_idx in range(min(max_subplots, len(cistron_ids))):
        y = all_degraded[:, subplot_idx]
        A = all_counts[:, subplot_idx]

        try:
            # Least-squares fit of y = kdeg * A.
            kdeg, _, _, _ = np.linalg.lstsq(A[:, np.newaxis], y, rcond=None)
            kdeg = kdeg[0]
        except (ValueError, np.linalg.LinAlgError):
            print(f"Skipping subplot {subplot_idx} because not enough data")
            continue

        # Plot every N-th point only.
        plot_data = pl.DataFrame({"RNA_counts": A[::N], "RNA_degraded": y[::N]})

        chart = (
            alt.Chart(plot_data)
            .mark_circle()
            .encode(
                x=alt.X("RNA_counts:Q", title="RNA (counts)"),
                y=alt.Y("RNA_degraded:Q", title="RNA degraded (counts)"),
            )
            .properties(
                # A list of strings gives one title line per entry.
                title=[
                    names[subplot_idx].split(" - ")[0],
                    f"kdeg meas: {kdeg:.1e}",
                    f"kdeg exp: {deg_rates[subplot_idx]:.1e}",
                ],
                width=250,
                height=200,
            )
        )
        charts.append(chart)

    if not charts:
        print("No charts were generated due to insufficient data")
        return

    # Wrap the charts into rows of three.
    combined_plot = alt.concat(*charts, columns=3)
    combined_plot.save(os.path.join(outdir, "rna_decay_03_high.html"))
len(charts), 3): + row_charts = charts[i : i + 3] + if len(row_charts) == 1: + rows.append(row_charts[0]) + else: + rows.append(alt.hconcat(*row_charts)) + + # Combine rows vertically + if len(rows) == 1: + combined_plot = rows[0] + else: + combined_plot = alt.vconcat(*rows) + + combined_plot.save(os.path.join(outdir, "rna_decay_03_high.html")) + else: + print("No charts were generated due to insufficient data") From edb539c5cc0b181cc21bc5da138a8e44a19060cc Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Thu, 26 Jun 2025 02:41:29 +0800 Subject: [PATCH 02/71] Fix dimension mismatch in average_monomer_counts --- ecoli/analysis/multivariant/average_monomer_counts.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ecoli/analysis/multivariant/average_monomer_counts.py b/ecoli/analysis/multivariant/average_monomer_counts.py index 460544eac..71aa91a77 100644 --- a/ecoli/analysis/multivariant/average_monomer_counts.py +++ b/ecoli/analysis/multivariant/average_monomer_counts.py @@ -126,7 +126,10 @@ def plot( avg_monomer_counts = ndlist_to_ndarray(variant_pair["avg_monomer_counts"]) # Save unfiltered data col_labels = ["all_monomer_ids", "var_0_avg_PCs", f"var_{exp_variant}_avg_PCs"] - values = np.concatenate((all_monomer_ids.T, avg_monomer_counts.T), axis=1) + # FIX: Reshape monomer IDs to 2D array before concatenation + all_monomer_ids_2d = all_monomer_ids.reshape(-1, 1) + # values = np.concatenate((all_monomer_ids.T, avg_monomer_counts.T), axis=1) + values = np.concatenate((all_monomer_ids_2d, avg_monomer_counts.T), axis=1) save_file( unfiltered_dir, f"wcm_full_monomers_{file_suffix}", col_labels, values ) @@ -145,7 +148,10 @@ def plot( "var_0_avg_PCs", f"var_{exp_variant}_avg_PCs", ] - values = np.concatenate((filtered_ids.T, avg_monomer_counts.T), axis=1) + # FIX: Reshape filtered IDs to 2D array before concatenation + filtered_ids_2d = filtered_ids.reshape(-1, 1) + # values = np.concatenate((filtered_ids.T, 
avg_monomer_counts.T), axis=1) + values = np.concatenate((filtered_ids_2d, avg_monomer_counts.T), axis=1) save_file( filtered_dir, f"wcm_filter_monomers_{file_suffix}", col_labels, values ) From 134f88ba139a11b612ff837f00de97ab59c6f3bf Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Thu, 26 Jun 2025 02:45:44 +0800 Subject: [PATCH 03/71] Modification and Add configs --- configs/hsy/multigen.json | 184 ++++++++++++++++++++++++ configs/hsy/multiseed.json | 181 ++++++++++++++++++++++++ configs/hsy/multivariant.json | 196 ++++++++++++++++++++++++++ configs/hsy/test.json | 19 +++ configs/hsy/variantAnalysis.json | 20 +++ ecoli/experiments/ecoli_master_sim.py | 1 + runscripts/analysis.py | 11 +- 7 files changed, 610 insertions(+), 2 deletions(-) create mode 100644 configs/hsy/multigen.json create mode 100644 configs/hsy/multiseed.json create mode 100644 configs/hsy/multivariant.json create mode 100644 configs/hsy/test.json create mode 100644 configs/hsy/variantAnalysis.json diff --git a/configs/hsy/multigen.json b/configs/hsy/multigen.json new file mode 100644 index 000000000..949bd7fb4 --- /dev/null +++ b/configs/hsy/multigen.json @@ -0,0 +1,184 @@ +{ + "inherit_from": [], + "experiment_id": "multigen", + "suffix_time": false, + "description": "A customized simulation for testing new parameters", + "progress_bar": true, + "sim_data_path": null, + "emitter": "parquet", + "emitter_arg": { + "out_dir": "out" + }, + "analysis_options": { + "single": {"mass_fraction_summary": {}}, + "multigeneration": { + "replication": {}, + "new_gene_counts": {}, + "ribosome_components": {}, + "ribosome_crowding": {}, + "ribosomeProduction": {}, + "ribosomeUsage": {}, + "rna_decay_03_high": {} + } + }, + "emit_topology": false, + "emit_processes": false, + "emit_config": false, + "emit_unique": false, + "log_updates": false, + "raw_output": true, + "seed": 0, + "mar_regulon": false, + "amp_lysis": false, + + "initial_state_file": "", + "initial_state_overrides": 
[], + "initial_state": {}, + "time_step": 1.0, + "total_time": 10800.0, + "initial_global_time": 0.0, + "fail_at_total_time": false, + + "variants": {}, + "skip_baseline": false, + "n_init_sims": 1, + "generations": 5, + "single_daughters": true, + "daughter_outdir": "out", + "lineage_seed": 0, + + "parca_options": { + "cpus": 6, + "outdir": "out", + "operons": true, + "ribosome_fitting": true, + "rnapoly_fitting": true, + "remove_rrna_operons": false, + "remove_rrff": false, + "stable_rrna": false, + "new_genes": "off", + "debug_parca": false, + "load_intermediate": null, + "save_intermediates": false, + "intermediates_directory": "", + "variable_elongation_transcription": true, + "variable_elongation_translation": false + }, + + "agent_id": "0", + "divide": true, + "d_period": true, + "division_threshold": true, + "division_variable": ["divide"], + "chromosome_path": ["unique", "full_chromosome"], + "spatial_environment": false, + "spatial_environment_config": {}, + "fixed_media": "minimal", + "condition": "basal", + + "save": false, + "save_times": [], + + "add_processes": [], + "exclude_processes": [], + "swap_processes": {}, + "profile": false, + "processes": [ + "post-division-mass-listener", + + "bulk-timeline", + "media_update", + "exchange_data", + + "ecoli-tf-unbinding", + + "ecoli-equilibrium", + "ecoli-two-component-system", + "ecoli-rna-maturation", + + "ecoli-tf-binding", + + "ecoli-transcript-initiation", + "ecoli-polypeptide-initiation", + "ecoli-chromosome-replication", + "ecoli-protein-degradation", + "ecoli-rna-degradation", + "ecoli-complexation", + + "ecoli-transcript-elongation", + "ecoli-polypeptide-elongation", + + "ecoli-chromosome-structure", + + "ecoli-metabolism", + + "ecoli-mass-listener", + "RNA_counts_listener", + "rna_synth_prob_listener", + "monomer_counts_listener", + "dna_supercoiling_listener", + "replication_data_listener", + "rnap_data_listener", + "unique_molecule_counts", + "ribosome_data_listener", + "global_clock" + ], + 
"process_configs": { + "global_clock": {}, + "replication_data_listener": {"time_step": 1} + }, + + "topology": { + "bulk-timeline": { + "bulk": ["bulk"], + "global": ["timeline"], + "media_id": ["environment", "media_id"] + }, + "global_clock": { + "global_time": ["global_time"], + "next_update_time": ["next_update_time"] + } + }, + + "flow": { + "post-division-mass-listener": [], + "media_update": [["post-division-mass-listener"]], + "exchange_data": [["media_update"]], + + "ecoli-tf-unbinding": [["media_update"]], + + "ecoli-equilibrium": [["ecoli-tf-unbinding"]], + "ecoli-two-component-system": [["ecoli-tf-unbinding"]], + "ecoli-rna-maturation": [["ecoli-tf-unbinding"]], + + "ecoli-tf-binding": [["ecoli-equilibrium"]], + + "ecoli-transcript-initiation": [["ecoli-tf-binding"]], + "ecoli-polypeptide-initiation": [["ecoli-tf-binding"]], + "ecoli-chromosome-replication": [["ecoli-tf-binding"]], + "ecoli-protein-degradation": [["ecoli-tf-binding"]], + "ecoli-rna-degradation": [["ecoli-tf-binding"]], + "ecoli-complexation": [["ecoli-tf-binding"]], + + "ecoli-transcript-elongation": [["ecoli-complexation"]], + "ecoli-polypeptide-elongation": [["ecoli-complexation"]], + + "ecoli-chromosome-structure": [["ecoli-polypeptide-elongation"]], + + "ecoli-metabolism": [["ecoli-chromosome-structure"]], + + "ecoli-mass-listener": [["ecoli-metabolism"]], + "RNA_counts_listener": [["ecoli-metabolism"]], + "rna_synth_prob_listener": [["ecoli-metabolism"]], + "monomer_counts_listener": [["ecoli-metabolism"]], + "dna_supercoiling_listener": [["ecoli-metabolism"]], + "replication_data_listener": [["ecoli-metabolism"]], + "rnap_data_listener": [["ecoli-metabolism"]], + "unique_molecule_counts": [["ecoli-metabolism"]], + "ribosome_data_listener": [["ecoli-metabolism"]] + }, + "engine_process_reports": [ + ["listeners"] + ], + "emit_paths": [] +} \ No newline at end of file diff --git a/configs/hsy/multiseed.json b/configs/hsy/multiseed.json new file mode 100644 index 
000000000..a06c91d04 --- /dev/null +++ b/configs/hsy/multiseed.json @@ -0,0 +1,181 @@ +{ + "inherit_from": [], + "experiment_id": "multiseed", + "suffix_time": false, + "description": "A customized simulation for testing new parameters", + "progress_bar": true, + "sim_data_path": null, + "emitter": "parquet", + "emitter_arg": { + "out_dir": "out" + }, + "analysis_options": { + "single": {"mass_fraction_summary": {}}, + "multiseed": { + "ecocyc_table": {}, + "protein_counts_validation": {}, + "ribosome_spacing":{}, + "subgenerational_expression_table": {} + } + }, + "emit_topology": false, + "emit_processes": false, + "emit_config": false, + "emit_unique": false, + "log_updates": false, + "raw_output": true, + "seed": 0, + "mar_regulon": false, + "amp_lysis": false, + + "initial_state_file": "", + "initial_state_overrides": [], + "initial_state": {}, + "time_step": 1.0, + "total_time": 10800.0, + "initial_global_time": 0.0, + "fail_at_total_time": false, + + "variants": {}, + "skip_baseline": false, + "n_init_sims": 1, + "generations": 5, + "single_daughters": true, + "daughter_outdir": "out", + "lineage_seed": 0, + + "parca_options": { + "cpus": 6, + "outdir": "out", + "operons": true, + "ribosome_fitting": true, + "rnapoly_fitting": true, + "remove_rrna_operons": false, + "remove_rrff": false, + "stable_rrna": false, + "new_genes": "off", + "debug_parca": false, + "load_intermediate": null, + "save_intermediates": false, + "intermediates_directory": "", + "variable_elongation_transcription": true, + "variable_elongation_translation": false + }, + + "agent_id": "0", + "divide": true, + "d_period": true, + "division_threshold": true, + "division_variable": ["divide"], + "chromosome_path": ["unique", "full_chromosome"], + "spatial_environment": false, + "spatial_environment_config": {}, + "fixed_media": "minimal", + "condition": "basal", + + "save": false, + "save_times": [], + + "add_processes": [], + "exclude_processes": [], + "swap_processes": {}, + "profile": 
false, + "processes": [ + "post-division-mass-listener", + + "bulk-timeline", + "media_update", + "exchange_data", + + "ecoli-tf-unbinding", + + "ecoli-equilibrium", + "ecoli-two-component-system", + "ecoli-rna-maturation", + + "ecoli-tf-binding", + + "ecoli-transcript-initiation", + "ecoli-polypeptide-initiation", + "ecoli-chromosome-replication", + "ecoli-protein-degradation", + "ecoli-rna-degradation", + "ecoli-complexation", + + "ecoli-transcript-elongation", + "ecoli-polypeptide-elongation", + + "ecoli-chromosome-structure", + + "ecoli-metabolism", + + "ecoli-mass-listener", + "RNA_counts_listener", + "rna_synth_prob_listener", + "monomer_counts_listener", + "dna_supercoiling_listener", + "replication_data_listener", + "rnap_data_listener", + "unique_molecule_counts", + "ribosome_data_listener", + "global_clock" + ], + "process_configs": { + "global_clock": {}, + "replication_data_listener": {"time_step": 1} + }, + + "topology": { + "bulk-timeline": { + "bulk": ["bulk"], + "global": ["timeline"], + "media_id": ["environment", "media_id"] + }, + "global_clock": { + "global_time": ["global_time"], + "next_update_time": ["next_update_time"] + } + }, + + "flow": { + "post-division-mass-listener": [], + "media_update": [["post-division-mass-listener"]], + "exchange_data": [["media_update"]], + + "ecoli-tf-unbinding": [["media_update"]], + + "ecoli-equilibrium": [["ecoli-tf-unbinding"]], + "ecoli-two-component-system": [["ecoli-tf-unbinding"]], + "ecoli-rna-maturation": [["ecoli-tf-unbinding"]], + + "ecoli-tf-binding": [["ecoli-equilibrium"]], + + "ecoli-transcript-initiation": [["ecoli-tf-binding"]], + "ecoli-polypeptide-initiation": [["ecoli-tf-binding"]], + "ecoli-chromosome-replication": [["ecoli-tf-binding"]], + "ecoli-protein-degradation": [["ecoli-tf-binding"]], + "ecoli-rna-degradation": [["ecoli-tf-binding"]], + "ecoli-complexation": [["ecoli-tf-binding"]], + + "ecoli-transcript-elongation": [["ecoli-complexation"]], + "ecoli-polypeptide-elongation": 
[["ecoli-complexation"]], + + "ecoli-chromosome-structure": [["ecoli-polypeptide-elongation"]], + + "ecoli-metabolism": [["ecoli-chromosome-structure"]], + + "ecoli-mass-listener": [["ecoli-metabolism"]], + "RNA_counts_listener": [["ecoli-metabolism"]], + "rna_synth_prob_listener": [["ecoli-metabolism"]], + "monomer_counts_listener": [["ecoli-metabolism"]], + "dna_supercoiling_listener": [["ecoli-metabolism"]], + "replication_data_listener": [["ecoli-metabolism"]], + "rnap_data_listener": [["ecoli-metabolism"]], + "unique_molecule_counts": [["ecoli-metabolism"]], + "ribosome_data_listener": [["ecoli-metabolism"]] + }, + "engine_process_reports": [ + ["listeners"] + ], + "emit_paths": [] +} \ No newline at end of file diff --git a/configs/hsy/multivariant.json b/configs/hsy/multivariant.json new file mode 100644 index 000000000..25d40defa --- /dev/null +++ b/configs/hsy/multivariant.json @@ -0,0 +1,196 @@ +{ + "inherit_from": [], + "experiment_id": "multivariant", + "suffix_time": false, + "description": "A customized simulation for testing new parameters", + "progress_bar": true, + "sim_data_path": null, + "emitter": "parquet", + "emitter_arg": { + "out_dir": "out" + }, + "analysis_options": { + "single": {"mass_fraction_summary": {}}, + "multivariant": { + "average_monomer_counts": {}, + "doubling_time_hist": {}, + "doubling_time_line": {}, + "dummy": {}, + "new_gene_translation_efficiency_heatmaps": {} + } + }, + "emit_topology": false, + "emit_processes": false, + "emit_config": false, + "emit_unique": false, + "log_updates": false, + "raw_output": true, + "seed": 0, + "mar_regulon": false, + "amp_lysis": false, + + "initial_state_file": "", + "initial_state_overrides": [], + "initial_state": {}, + "time_step": 1.0, + "total_time": 10800.0, + "initial_global_time": 0.0, + "fail_at_total_time": false, + + "variants": { + "variant_test": { + "a": {"value": [1, 2]}, + "b": {"value": ["one", "two"]}, + "c": { + "nested": { + "d": {"value": [3, 4]}, + "e": {"value": 
[5.0, 6.0]}, + "op": "zip" + } + }, + "op": "prod" + } + }, + + "skip_baseline": false, + "n_init_sims": 1, + "generations": 2, + "single_daughters": true, + "daughter_outdir": "out", + "lineage_seed": 0, + + "parca_options": { + "cpus": 6, + "outdir": "out", + "operons": true, + "ribosome_fitting": true, + "rnapoly_fitting": true, + "remove_rrna_operons": false, + "remove_rrff": false, + "stable_rrna": false, + "new_genes": "off", + "debug_parca": false, + "load_intermediate": null, + "save_intermediates": false, + "intermediates_directory": "", + "variable_elongation_transcription": true, + "variable_elongation_translation": false + }, + + "agent_id": "0", + "divide": true, + "d_period": true, + "division_threshold": true, + "division_variable": ["divide"], + "chromosome_path": ["unique", "full_chromosome"], + "spatial_environment": false, + "spatial_environment_config": {}, + "fixed_media": "minimal", + "condition": "basal", + + "save": false, + "save_times": [], + + "add_processes": [], + "exclude_processes": [], + "swap_processes": {}, + "profile": false, + "processes": [ + "post-division-mass-listener", + + "bulk-timeline", + "media_update", + "exchange_data", + + "ecoli-tf-unbinding", + + "ecoli-equilibrium", + "ecoli-two-component-system", + "ecoli-rna-maturation", + + "ecoli-tf-binding", + + "ecoli-transcript-initiation", + "ecoli-polypeptide-initiation", + "ecoli-chromosome-replication", + "ecoli-protein-degradation", + "ecoli-rna-degradation", + "ecoli-complexation", + + "ecoli-transcript-elongation", + "ecoli-polypeptide-elongation", + + "ecoli-chromosome-structure", + + "ecoli-metabolism", + + "ecoli-mass-listener", + "RNA_counts_listener", + "rna_synth_prob_listener", + "monomer_counts_listener", + "dna_supercoiling_listener", + "replication_data_listener", + "rnap_data_listener", + "unique_molecule_counts", + "ribosome_data_listener", + "global_clock" + ], + "process_configs": { + "global_clock": {}, + "replication_data_listener": {"time_step": 1} + 
}, + + "topology": { + "bulk-timeline": { + "bulk": ["bulk"], + "global": ["timeline"], + "media_id": ["environment", "media_id"] + }, + "global_clock": { + "global_time": ["global_time"], + "next_update_time": ["next_update_time"] + } + }, + + "flow": { + "post-division-mass-listener": [], + "media_update": [["post-division-mass-listener"]], + "exchange_data": [["media_update"]], + + "ecoli-tf-unbinding": [["media_update"]], + + "ecoli-equilibrium": [["ecoli-tf-unbinding"]], + "ecoli-two-component-system": [["ecoli-tf-unbinding"]], + "ecoli-rna-maturation": [["ecoli-tf-unbinding"]], + + "ecoli-tf-binding": [["ecoli-equilibrium"]], + + "ecoli-transcript-initiation": [["ecoli-tf-binding"]], + "ecoli-polypeptide-initiation": [["ecoli-tf-binding"]], + "ecoli-chromosome-replication": [["ecoli-tf-binding"]], + "ecoli-protein-degradation": [["ecoli-tf-binding"]], + "ecoli-rna-degradation": [["ecoli-tf-binding"]], + "ecoli-complexation": [["ecoli-tf-binding"]], + + "ecoli-transcript-elongation": [["ecoli-complexation"]], + "ecoli-polypeptide-elongation": [["ecoli-complexation"]], + + "ecoli-chromosome-structure": [["ecoli-polypeptide-elongation"]], + + "ecoli-metabolism": [["ecoli-chromosome-structure"]], + + "ecoli-mass-listener": [["ecoli-metabolism"]], + "RNA_counts_listener": [["ecoli-metabolism"]], + "rna_synth_prob_listener": [["ecoli-metabolism"]], + "monomer_counts_listener": [["ecoli-metabolism"]], + "dna_supercoiling_listener": [["ecoli-metabolism"]], + "replication_data_listener": [["ecoli-metabolism"]], + "rnap_data_listener": [["ecoli-metabolism"]], + "unique_molecule_counts": [["ecoli-metabolism"]], + "ribosome_data_listener": [["ecoli-metabolism"]] + }, + "engine_process_reports": [ + ["listeners"] + ], + "emit_paths": [] +} \ No newline at end of file diff --git a/configs/hsy/test.json b/configs/hsy/test.json new file mode 100644 index 000000000..6b775b732 --- /dev/null +++ b/configs/hsy/test.json @@ -0,0 +1,19 @@ +{ + "experiment_id": 
"test_multigen_replication", + "suffix_time": false, + "parca_options": { + "cpus": 4 + }, + "fail_at_total_time": true, + "sim_data_path": null, + "generations": 3, + "n_init_sims": 1, + "single_daughters": true, + "emitter": "parquet", + "emitter_arg": { + "out_dir": "out" + }, + "analysis_options": { + "multigeneration": {"replication": {}} + } +} diff --git a/configs/hsy/variantAnalysis.json b/configs/hsy/variantAnalysis.json new file mode 100644 index 000000000..5718d6e37 --- /dev/null +++ b/configs/hsy/variantAnalysis.json @@ -0,0 +1,20 @@ +{ + "emitter_arg": { + "out_dir": "out" + }, + "analysis_options": { + "experiment_id": ["multivariant"], + "variant": [0, 1, 2, 3, 4, 5, 6, 7, 8], + "lineage_seed": [0], + "generation": [1, 2], + "agent_id": ["0"], + "validation_data_path": ["out/multivariant/parca/kb/validationData.cPickle"], + "variant_data_dir": ["out/multivariant/variant_sim_data"], + "outdir": "out/multivariant/analyses", + "cpus": 4, + + "multivariant": { + "average_monomer_counts": {} + } + } +} diff --git a/ecoli/experiments/ecoli_master_sim.py b/ecoli/experiments/ecoli_master_sim.py index 8513b6c3c..b4525fe61 100644 --- a/ecoli/experiments/ecoli_master_sim.py +++ b/ecoli/experiments/ecoli_master_sim.py @@ -353,6 +353,7 @@ def __init__( type=float, action="store", help="Initial time in context of whole lineage.", + default=0.0, ) self.parser.add_argument( "--fail_at_total_time", diff --git a/runscripts/analysis.py b/runscripts/analysis.py index 0ac0bc803..b94acd1a5 100644 --- a/runscripts/analysis.py +++ b/runscripts/analysis.py @@ -304,6 +304,7 @@ def main(): variant_names = {config["experiment_id"][0]: variant_name} # Establish DuckDB connection + # print(f"[DEBUG] The out_uri for analyses is: {out_uri}") conn = create_duckdb_conn(out_uri, gcs_bucket, config.get("cpus")) history_sql, config_sql, success_sql = dataset_sql(out_uri, config["experiment_id"]) # If no explicit analysis type given, run all types in config JSON @@ -324,7 +325,7 @@ def 
main(): # Figure out what Hive partition in main output directory # to store outputs for analyses run on this cell subset curr_outdir = os.path.abspath(config["outdir"]) - config_outdir = curr_outdir + # config_outdir = curr_outdir if len(cols) > 0: joined_cols = ", ".join(cols) data_ids = conn.sql( @@ -371,8 +372,14 @@ def main(): variant_names, ) + top_outdir = os.path.abspath(config["outdir"]) + os.makedirs(top_outdir, exist_ok=True) + # Save copy of config JSON with parameters for plots - with open(os.path.join(config_outdir, "metadata.json"), "w") as f: + # with open(os.path.join(config_outdir, "metadata.json"), "w") as f: + # json.dump(config, f) + + with open(os.path.join(top_outdir, "metadata.json"), "w") as f: json.dump(config, f) From ad329d50e5b43c9989230e982cfdd6d3d1e40b51 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Fri, 27 Jun 2025 05:04:11 +0800 Subject: [PATCH 04/71] Migration --- configs/hsy/genAnalysis.json | 20 ++ ecoli/analysis/multigeneration/replication.py | 329 +++++++++++------ .../multigeneration/ribosome_components.py | 338 ++++++------------ 3 files changed, 351 insertions(+), 336 deletions(-) create mode 100644 configs/hsy/genAnalysis.json diff --git a/configs/hsy/genAnalysis.json b/configs/hsy/genAnalysis.json new file mode 100644 index 000000000..eb9ccb45c --- /dev/null +++ b/configs/hsy/genAnalysis.json @@ -0,0 +1,20 @@ +{ + "emitter_arg": { + "out_dir": "out" + }, + "analysis_options": { + "experiment_id": ["multigen"], + "variant": [0], + "lineage_seed": [0], + "generation": [1, 2, 3, 4, 5], + "agent_id": ["0"], + "variant_data_dir": ["out/multigen/variant_sim_data"], + "validation_data_path": ["out/multigen/parca/kb/validationData.cPickle"], + "outdir": "out/multigen/analyses", + "cpus": 4, + + "multigeneration": { + "ribosome_components": {} + } + } +} diff --git a/ecoli/analysis/multigeneration/replication.py b/ecoli/analysis/multigeneration/replication.py index 883f32f17..512e4b431 100644 --- 
a/ecoli/analysis/multigeneration/replication.py +++ b/ecoli/analysis/multigeneration/replication.py @@ -27,72 +27,125 @@ def plot( variant_metadata: dict[str, dict[int, Any]], variant_names: dict[str, str], ): + """Create comprehensive replication visualization plots for E. coli simulation data.""" # Load sim_data to get genome length with open_arbitrary_sim_data(sim_data_dict) as f: sim_data = pickle.load(f) genome_length = len(sim_data.process.replication.genome_sequence) - # Load all the required data - data_columns = [ - "time", - "listenersreplication_datanumberOfOric", - "listenersreplication_datacriticalInitiationMass", - "listenersreplication_datacriticalMassPerOriC", - "listenersreplication_datafork_coordinates", - "listenersmassdrymass", - "listenersmasscellmass", + # Discover available columns + result = conn.sql(f"DESCRIBE ({history_sql})").pl() + available_columns = result["column_name"].to_list() + + # Filter for relevant columns + replication_columns = [ + col for col in available_columns if "replication" in col.lower() ] + mass_columns = [col for col in available_columns if "mass" in col.lower()] - plot_data = read_stacked_columns( - history_sql, - data_columns, - conn=conn, + print( + f"Found {len(replication_columns)} replication columns and {len(mass_columns)} mass columns" ) + # Define required columns mapping + column_mapping = { + "number_of_oric": next( + (col for col in available_columns if "number_of_oric" in col), None + ), + "fork_coordinates": next( + (col for col in available_columns if "fork_coordinates" in col), None + ), + "cell_mass": next( + ( + col + for col in available_columns + if "cell_mass" in col and "fold_change" not in col + ), + None, + ), + "dry_mass": next( + ( + col + for col in available_columns + if "dry_mass" in col and "fold_change" not in col + ), + None, + ), + "critical_initiation_mass": next( + (col for col in available_columns if "critical_initiation_mass" in col), + None, + ), + "critical_mass_per_oric": 
next( + (col for col in available_columns if "critical_mass_per_oric" in col), None + ), + } + + # Build list of columns to load + data_columns = ["time"] + for key, col_name in column_mapping.items(): + if col_name: + data_columns.append(col_name) + print(f"Using {col_name} for {key}") + + # Load data + plot_data = read_stacked_columns(history_sql, data_columns, conn=conn) + # Convert to DataFrame and add time in hours df = pl.DataFrame(plot_data).with_columns( - **{"Time (hr)": pl.col("time") / 3600} # Convert seconds to hours + pl.col("time").truediv(3600).alias("Time (hr)") ) - # Calculate pairs of forks from fork coordinates - # Fork coordinates is a 2D array, count non-NaN values and divide by 2 - fork_coords = df["listenersreplication_datafork_coordinates"].to_numpy() - pairs_of_forks = [] - for coord_array in fork_coords: - if coord_array is not None and len(coord_array) > 0: - pairs_of_forks.append(np.sum(~np.isnan(coord_array)) / 2) - else: - pairs_of_forks.append(0) + print(f"Loaded data: {df.shape[0]} rows, {df.shape[1]} columns") + + # Process fork coordinates and calculate pairs of forks + if column_mapping["fork_coordinates"]: + fork_coords_col = column_mapping["fork_coordinates"] + pairs_of_forks = [] - df = df.with_columns(pl.Series("pairs_of_forks", pairs_of_forks)) + for coord_array in df[fork_coords_col].to_numpy(): + if coord_array is not None and len(coord_array) > 0: + # Count non-NaN coordinates and divide by 2 for pairs + pairs_of_forks.append(np.sum(~np.isnan(coord_array)) / 2) + else: + pairs_of_forks.append(0) + + df = df.with_columns(pl.Series("pairs_of_forks", pairs_of_forks)) # Calculate critical mass equivalents - df = df.with_columns( - ( - pl.col("listenersmasscellmass") - / pl.col("listenersreplication_datacriticalInitiationMass") - ).alias("critical_mass_equivalents") - ) + if column_mapping["cell_mass"] and column_mapping["critical_initiation_mass"]: + df = df.with_columns( + ( + pl.col(column_mapping["cell_mass"]) + / 
pl.col(column_mapping["critical_initiation_mass"]) + ).alias("critical_mass_equivalents") + ) - # Create individual plots + # Create visualization functions + def create_fork_positions_plot(): + """Create DNA polymerase positions scatter plot.""" + if not column_mapping["fork_coordinates"]: + return None - # 1. Fork positions plot - this is complex due to the 2D nature, we'll create a simplified version - fork_positions_data = [] - for i, (time_val, coords) in enumerate(zip(df["Time (hr)"], fork_coords)): - if coords is not None and len(coords) > 0: - for coord in coords: - if not np.isnan(coord): - fork_positions_data.append( - {"Time (hr)": time_val, "Position": coord} - ) + fork_positions_data = [] + fork_coords_col = column_mapping["fork_coordinates"] + + for time_val, coords in zip(df["Time (hr)"], df[fork_coords_col]): + if coords is not None and len(coords) > 0: + for coord in coords: + if not np.isnan(coord): + fork_positions_data.append( + {"Time (hr)": time_val, "Position": coord} + ) + + if not fork_positions_data: + return None - if fork_positions_data: fork_df = pl.DataFrame(fork_positions_data) - fork_plot = ( + return ( alt.Chart(fork_df) - .mark_circle(size=5) + .mark_circle(size=5, opacity=0.7) .encode( - x=alt.X("Time (hr):Q"), + x=alt.X("Time (hr):Q", title="Time (hr)"), y=alt.Y( "Position:Q", scale=alt.Scale(domain=[-genome_length / 2, genome_length / 2]), @@ -103,76 +156,140 @@ def plot( title="DNA polymerase position (nt)", ), ) - .properties(title="DNA Polymerase Positions", width=600, height=100) + .properties(title="DNA Polymerase Positions", width=600, height=120) ) - else: - # Create empty plot if no fork data - fork_plot = ( - alt.Chart(pl.DataFrame({"x": [0], "y": [0]})) - .mark_text(text="No fork data available") - .encode(x="x:Q", y="y:Q") - .properties(width=600, height=100) + + def create_pairs_of_forks_plot(): + """Create pairs of replication forks line plot.""" + if "pairs_of_forks" not in df.columns: + return None + + return ( + 
alt.Chart(df.to_pandas()) + .mark_line(strokeWidth=2) + .encode( + x=alt.X("Time (hr):Q", title="Time (hr)"), + y=alt.Y( + "pairs_of_forks:Q", + scale=alt.Scale(domain=[0, 6]), + title="Pairs of forks", + ), + ) + .properties(title="Pairs of Replication Forks", width=600, height=100) ) - # 2. Pairs of forks plot - pairs_plot = df.plot.line( - x="Time (hr)", - y=alt.Y( - "pairs_of_forks", scale=alt.Scale(domain=[0, 6]), title="Pairs of forks" - ), - ).properties(title="Pairs of Replication Forks", width=600, height=100) + def create_critical_mass_plot(): + """Create critical mass equivalents plot with reference lines.""" + if "critical_mass_equivalents" not in df.columns: + return None - # 3. Critical mass equivalents plot with reference lines - base_critical_plot = df.plot.line( - x="Time (hr)", - y=alt.Y( - "critical_mass_equivalents", title="Factors of critical initiation mass" - ), - ) + # Main line plot + base_plot = ( + alt.Chart(df.to_pandas()) + .mark_line(strokeWidth=2) + .encode( + x=alt.X("Time (hr):Q", title="Time (hr)"), + y=alt.Y( + "critical_mass_equivalents:Q", + title="Factors of critical initiation mass", + ), + ) + ) - # Add reference lines for critical N values - reference_lines = ( - alt.Chart( - pl.DataFrame({"y": CRITICAL_N, "label": [f"N={n}" for n in CRITICAL_N]}) + # Reference lines for critical N values + reference_data = pl.DataFrame( + {"y": CRITICAL_N, "label": [f"N={n}" for n in CRITICAL_N]} + ) + + reference_lines = ( + alt.Chart(reference_data.to_pandas()) + .mark_rule(strokeDash=[5, 5], color="gray", opacity=0.7) + .encode(y="y:Q") ) - .mark_rule(strokeDash=[5, 5], color="black") - .encode(y="y:Q") - ) - critical_plot = (base_critical_plot + reference_lines).properties( - title="Factors of Critical Initiation Mass", width=600, height=100 + # Text labels for reference lines + reference_labels = ( + alt.Chart(reference_data.to_pandas()) + .mark_text(align="left", dx=5, fontSize=10, color="gray") + .encode(y="y:Q", text="label:N") + 
.transform_calculate(x="0") + .encode(x=alt.X("x:Q")) + ) + + return (base_plot + reference_lines + reference_labels).properties( + title="Factors of Critical Initiation Mass", width=600, height=100 + ) + + def create_mass_plot(column_key: str, title: str, y_title: str): + """Create a generic mass plot.""" + if not column_mapping[column_key]: + return None + + return ( + alt.Chart(df.to_pandas()) + .mark_line(strokeWidth=2) + .encode( + x=alt.X("Time (hr):Q", title="Time (hr)"), + y=alt.Y(f"{column_mapping[column_key]}:Q", title=y_title), + ) + .properties(title=title, width=600, height=100) + ) + + # Generate all plots + plots = [] + + # 1. Fork positions + fork_plot = create_fork_positions_plot() + if fork_plot: + plots.append(fork_plot) + + # 2. Pairs of forks + pairs_plot = create_pairs_of_forks_plot() + if pairs_plot: + plots.append(pairs_plot) + + # 3. Critical mass equivalents + critical_plot = create_critical_mass_plot() + if critical_plot: + plots.append(critical_plot) + + # 4. Dry mass + dry_mass_plot = create_mass_plot("dry_mass", "Dry Mass", "Dry mass (fg)") + if dry_mass_plot: + plots.append(dry_mass_plot) + + # 5. Number of oriC + oric_plot = create_mass_plot("number_of_oric", "Number of oriC", "Number of oriC") + if oric_plot: + plots.append(oric_plot) + + # 6. Critical mass per oriC + mass_per_oric_plot = create_mass_plot( + "critical_mass_per_oric", "Critical Mass per oriC", "Critical mass per oriC" ) + if mass_per_oric_plot: + plots.append(mass_per_oric_plot) - # 4. Dry mass plot - dry_mass_plot = df.plot.line( - x="Time (hr)", - y=alt.Y("listenersmassdryMass", title="Dry mass (fg)"), - ).properties(title="Dry Mass", width=600, height=100) - - # 5. Number of oriC plot - oric_plot = df.plot.line( - x="Time (hr)", - y=alt.Y("listenersreplication_datanumberOfOric", title="Number of oriC"), - ).properties(title="Number of oriC", width=600, height=100) - - # 6. 
Critical mass per oriC plot - mass_per_oric_plot = df.plot.line( - x="Time (hr)", - y=alt.Y( - "listenersreplication_datacriticalMassPerOriC", - title="Critical mass per oriC", - ), - ).properties(title="Critical Mass per oriC", width=600, height=100) - - # Combine all plots vertically - combined_plot = alt.vconcat( - fork_plot, - pairs_plot, - critical_plot, - dry_mass_plot, - oric_plot, - mass_per_oric_plot, - ).resolve_scale(x="shared") + # Combine plots or create fallback + if plots: + combined_plot = alt.vconcat(*plots).resolve_scale(x="shared") + print(f"Created visualization with {len(plots)} subplots") + else: + # Fallback plot if no data available + fallback_data = pl.DataFrame( + {"x": [0], "y": [0], "text": ["No data available for plotting"]} + ) + combined_plot = ( + alt.Chart(fallback_data.to_pandas()) + .mark_text(fontSize=20, color="red") + .encode(x=alt.X("x:Q", axis=None), y=alt.Y("y:Q", axis=None), text="text:N") + .properties(width=600, height=400, title="Replication Data Visualization") + ) + print("No plottable data found - created fallback message") # Save the plot - combined_plot.save(os.path.join(outdir, "replication.html")) + output_path = os.path.join(outdir, "replication_report.html") + combined_plot.save(output_path) + print(f"Saved visualization to: {output_path}") + + return combined_plot diff --git a/ecoli/analysis/multigeneration/ribosome_components.py b/ecoli/analysis/multigeneration/ribosome_components.py index bf1ba6321..ee93d02b4 100644 --- a/ecoli/analysis/multigeneration/ribosome_components.py +++ b/ecoli/analysis/multigeneration/ribosome_components.py @@ -1,11 +1,10 @@ import altair as alt import os -from typing import Any +from typing import Any, Dict from duckdb import DuckDBPyConnection import pickle import polars as pl -import numpy as np from ecoli.library.parquet_emitter import ( field_metadata, @@ -16,266 +15,145 @@ def plot( - params: dict[str, Any], + params: Dict[str, Any], conn: DuckDBPyConnection, history_sql: str, 
config_sql: str, success_sql: str, - sim_data_dict: dict[str, dict[int, str]], + sim_data_dict: Dict[str, Dict[int, str]], validation_data_paths: list[str], outdir: str, - variant_metadata: dict[str, dict[int, Any]], - variant_names: dict[str, str], + variant_metadata: Dict[str, Dict[int, Any]], + variant_names: Dict[str, str], ): - """ - Plots the timetrace of counts for each of the components of the ribosomal - subunits (rRNAs and ribosomal proteins). - """ - - # Load sim_data + # Load simulation data with open_arbitrary_sim_data(sim_data_dict) as f: sim_data = pickle.load(f) - # Load IDs of ribosome components from sim_data + # Extract molecule IDs for ribosomal subunits s30_protein_ids = sim_data.molecule_groups.s30_proteins s30_16s_rRNA_ids = sim_data.molecule_groups.s30_16s_rRNA - s30_full_complex_id = [sim_data.molecule_ids.s30_full_complex] + s30_full_complex_id = sim_data.molecule_ids.s30_full_complex s50_protein_ids = sim_data.molecule_groups.s50_proteins s50_23s_rRNA_ids = sim_data.molecule_groups.s50_23s_rRNA s50_5s_rRNA_ids = sim_data.molecule_groups.s50_5s_rRNA - s50_full_complex_id = [sim_data.molecule_ids.s50_full_complex] + s50_full_complex_id = sim_data.molecule_ids.s50_full_complex - # Get complexation stoichiometries of ribosomal proteins + # Retrieve stoichiometry for each protein subunit complexation = sim_data.process.complexation - s30_monomers = complexation.get_monomers(s30_full_complex_id[0]) - s50_monomers = complexation.get_monomers(s50_full_complex_id[0]) - s30_subunit_id_to_stoich = { - subunit_id: stoich - for (subunit_id, stoich) in zip( - s30_monomers["subunitIds"], s30_monomers["subunitStoich"] - ) - } - s50_subunit_id_to_stoich = { - subunit_id: stoich - for (subunit_id, stoich) in zip( - s50_monomers["subunitIds"], s50_monomers["subunitStoich"] - ) - } - s30_protein_stoich = np.array( - [s30_subunit_id_to_stoich[subunit_id] for subunit_id in s30_protein_ids] - ) - s50_protein_stoich = np.array( - 
[s50_subunit_id_to_stoich[subunit_id] for subunit_id in s50_protein_ids] - ) - - # Get metadata for extracting indexes - unique_molecule_metadata = field_metadata( - conn, config_sql, "listeners__unique_molecule_counts" - ) - monomer_metadata = field_metadata(conn, config_sql, "listeners__monomer_counts") - - # Extract indexes - active_ribosome_index = unique_molecule_metadata.index("active_ribosome") - - monomer_id_to_index = { - monomer_id: i for (i, monomer_id) in enumerate(monomer_metadata) - } - s30_protein_indexes = [ - monomer_id_to_index[protein_id] for protein_id in s30_protein_ids + s30_info = complexation.get_monomers(s30_full_complex_id) + s50_info = complexation.get_monomers(s50_full_complex_id) + s30_stoich = dict(zip(s30_info["subunitIds"], s30_info["subunitStoich"])) + s50_stoich = dict(zip(s50_info["subunitIds"], s50_info["subunitStoich"])) + + # Map bulk IDs to SQL column indices + bulk_ids = field_metadata(conn, config_sql, "bulk") + bulk_index = {mid: idx for idx, mid in enumerate(bulk_ids)} + + # Determine column indexes in SQL for rRNAs and complexes + s30_16s_idx = [bulk_index[i] for i in s30_16s_rRNA_ids if i in bulk_index] + s50_23s_idx = [bulk_index[i] for i in s50_23s_rRNA_ids if i in bulk_index] + s50_5s_idx = [bulk_index[i] for i in s50_5s_rRNA_ids if i in bulk_index] + s30_complex_idx = bulk_index[s30_full_complex_id] + s50_complex_idx = bulk_index[s50_full_complex_id] + + # Map monomer counts IDs to SQL column indices + mono_ids = field_metadata(conn, config_sql, "listeners__monomer_counts") + mono_index = {mid: idx for idx, mid in enumerate(mono_ids)} + s30_protein_idx = [mono_index[i] for i in s30_protein_ids if i in mono_index] + s50_protein_idx = [mono_index[i] for i in s50_protein_ids if i in mono_index] + + # Build named_idx spec for reading + bulk_cols = [ + named_idx("bulk", s30_16s_rRNA_ids, [s30_16s_idx]), + named_idx("bulk", s50_23s_rRNA_ids, [s50_23s_idx]), + named_idx("bulk", s50_5s_rRNA_ids, [s50_5s_idx]), + 
named_idx("bulk", [s30_full_complex_id], [[s30_complex_idx]]), + named_idx("bulk", [s50_full_complex_id], [[s50_complex_idx]]), ] - s50_protein_indexes = [ - monomer_id_to_index[protein_id] for protein_id in s50_protein_ids + protein_cols = [ + named_idx("listeners__monomer_counts", [pid], [[idx]]) + for pid, idx in zip( + s30_protein_ids + s50_protein_ids, s30_protein_idx + s50_protein_idx + ) ] - - # Define named indexes for bulk molecules - s30_16s_rRNAs = named_idx( - "listeners__bulk_molecules", - s30_16s_rRNA_ids, - list(range(len(s30_16s_rRNA_ids))), # Will be resolved by named_idx - ) - s50_23s_rRNAs = named_idx( - "listeners__bulk_molecules", - s50_23s_rRNA_ids, - list(range(len(s50_23s_rRNA_ids))), - ) - s50_5s_rRNAs = named_idx( - "listeners__bulk_molecules", s50_5s_rRNA_ids, list(range(len(s50_5s_rRNA_ids))) + additional = ["listeners__unique_molecule_counts__active_ribosome", "time"] + cols = bulk_cols + protein_cols + additional + + # Read time-series data + data = read_stacked_columns(history_sql, cols, conn=conn) + df = pl.DataFrame(data).with_columns(Time_min=pl.col("time") / 60) + + # Sum rRNA counts horizontally + s30_16s = pl.sum_horizontal([pl.col(i) for i in s30_16s_rRNA_ids]) + s50_23s = pl.sum_horizontal([pl.col(i) for i in s50_23s_rRNA_ids]) + s50_5s = pl.sum_horizontal([pl.col(i) for i in s50_5s_rRNA_ids]) + + # Extract complex and active ribosome counts + s30_complex = pl.col(s30_full_complex_id) + s50_complex = pl.col(s50_full_complex_id) + active_ribo = pl.col("listeners__unique_molecule_counts__active_ribosome") + + # Adjust protein counts by stoichiometry + for pid in s30_protein_ids: + df = df.with_columns(**{f"adj_s30_{pid}": pl.col(pid) / s30_stoich[pid]}) + for pid in s50_protein_ids: + df = df.with_columns(**{f"adj_s50_{pid}": pl.col(pid) / s50_stoich[pid]}) + + # Determine limiting protein across subunits + s30_lim = pl.min_horizontal([pl.col(f"adj_s30_{pid}") for pid in s30_protein_ids]) + s50_lim = 
pl.min_horizontal([pl.col(f"adj_s50_{pid}") for pid in s50_protein_ids]) + + # Calculate total rRNA including complexes and active ribosomes + df = df.with_columns( + s30_16s_total=s30_16s + s30_complex + active_ribo, + s50_23s_total=s50_23s + s50_complex + active_ribo, + s50_5s_total=s50_5s + s50_complex + active_ribo, + s30_limiting=s30_lim, + s50_limiting=s50_lim, + s30_total=s30_complex + active_ribo, + s50_total=s50_complex + active_ribo, ) - s30_full_complex = named_idx("listeners__bulk_molecules", s30_full_complex_id, [0]) - s50_full_complex = named_idx("listeners__bulk_molecules", s50_full_complex_id, [0]) - # Named indexes for monomer counts - s30_proteins = named_idx( - "listeners__monomer_counts", s30_protein_ids, s30_protein_indexes - ) - s50_proteins = named_idx( - "listeners__monomer_counts", s50_protein_ids, s50_protein_indexes - ) + # Prepare data for plotting by melting into long format + plot_cols_30 = ["s30_limiting", "s30_16s_total", "s30_total"] + plot_cols_50 = ["s50_limiting", "s50_23s_total", "s50_5s_total", "s50_total"] - # Named index for active ribosomes - active_ribosomes = named_idx( - "listeners__unique_molecule_counts", - ["active_ribosome"], - [active_ribosome_index], + melt_30 = df.select(["Time_min"] + plot_cols_30).melt( + id_vars="Time_min", variable_name="component", value_name="count" ) - - # Load data - ribosome_data = read_stacked_columns( - history_sql, - [ - s30_16s_rRNAs, - s50_23s_rRNAs, - s50_5s_rRNAs, - s30_full_complex, - s50_full_complex, - s30_proteins, - s50_proteins, - active_ribosomes, - ], - conn=conn, + melt_50 = df.select(["Time_min"] + plot_cols_50).melt( + id_vars="Time_min", variable_name="component", value_name="count" ) - # Convert to DataFrame and add time column - df = pl.DataFrame(ribosome_data).with_columns(**{"Time (min)": pl.col("time") / 60}) - - # Calculate protein counts divided by stoichiometry - s30_protein_counts_cols = [] - for i, protein_id in enumerate(s30_protein_ids): - col_name = 
f"s30_protein_{i}_normalized" - df = df.with_columns( - (pl.col(protein_id) / s30_protein_stoich[i]).alias(col_name) - ) - s30_protein_counts_cols.append(col_name) - - s50_protein_counts_cols = [] - for i, protein_id in enumerate(s50_protein_ids): - col_name = f"s50_protein_{i}_normalized" - df = df.with_columns( - (pl.col(protein_id) / s50_protein_stoich[i]).alias(col_name) - ) - s50_protein_counts_cols.append(col_name) - - # Calculate limiting protein counts and total counts - df = df.with_columns( - [ - # S30 calculations - pl.min_horizontal(s30_protein_counts_cols).alias( - "s30_limiting_protein_counts" - ), - ( - pl.sum_horizontal(s30_16s_rRNA_ids) - + pl.col(s30_full_complex_id[0]) - + pl.col("active_ribosome") - ).alias("s30_16s_rRNA_total_counts"), - (pl.col(s30_full_complex_id[0]) + pl.col("active_ribosome")).alias( - "s30_total_counts" - ), - # S50 calculations - pl.min_horizontal(s50_protein_counts_cols).alias( - "s50_limiting_protein_counts" - ), - ( - pl.sum_horizontal(s50_23s_rRNA_ids) - + pl.col(s50_full_complex_id[0]) - + pl.col("active_ribosome") - ).alias("s50_23s_rRNA_total_counts"), - ( - pl.sum_horizontal(s50_5s_rRNA_ids) - + pl.col(s50_full_complex_id[0]) - + pl.col("active_ribosome") - ).alias("s50_5s_rRNA_total_counts"), - (pl.col(s50_full_complex_id[0]) + pl.col("active_ribosome")).alias( - "s50_total_counts" - ), - ] - ) - - # Create plots - # 30S components plot - s30_plot = ( - alt.Chart(df) + # Create 30S components chart with legend + chart_30 = ( + alt.Chart(melt_30) .mark_line() .encode( - x=alt.X("Time (min):Q").title("Time (min)"), - y=alt.Y("value:Q").title("30S component counts").scale(domain=[0, 60000]), - color=alt.Color("variable:N").title("Component"), - strokeDash=alt.StrokeDash("variable:N").scale( - domain=[ - "s30_limiting_protein_counts", - "s30_16s_rRNA_total_counts", - "s30_total_counts", - ], - range=[[5, 5], [0], [3, 3]], - ), - ) - .transform_fold( - [ - "s30_limiting_protein_counts", - "s30_16s_rRNA_total_counts", 
- "s30_total_counts", - ], - as_=["variable", "value"], - ) - .transform_calculate( - variable_label="datum.variable === 's30_limiting_protein_counts' ? 'limiting r-protein' : " - "datum.variable === 's30_16s_rRNA_total_counts' ? '16S rRNA' : '30S subunit'" + x="Time_min", + y="count", + color=alt.Color("component", title="30S Components"), ) - .encode( - color=alt.Color("variable_label:N") - .title("Component") - .scale( - domain=["limiting r-protein", "16S rRNA", "30S subunit"], - range=["#cccccc", "#1f77b4", "#000000"], - ) - ) - .properties(title="30S Ribosomal Subunit Components", width=600, height=200) + .properties(title="30S Component Counts", width=600) ) - # 50S components plot - s50_plot = ( - alt.Chart(df) + # Create 50S components chart with legend + chart_50 = ( + alt.Chart(melt_50) .mark_line() .encode( - x=alt.X("Time (min):Q").title("Time (min)"), - y=alt.Y("value:Q").title("50S component counts").scale(domain=[0, 60000]), - color=alt.Color("variable:N").title("Component"), - strokeDash=alt.StrokeDash("variable:N").scale( - domain=[ - "s50_limiting_protein_counts", - "s50_23s_rRNA_total_counts", - "s50_5s_rRNA_total_counts", - "s50_total_counts", - ], - range=[[5, 5], [0], [0], [3, 3]], - ), - ) - .transform_fold( - [ - "s50_limiting_protein_counts", - "s50_23s_rRNA_total_counts", - "s50_5s_rRNA_total_counts", - "s50_total_counts", - ], - as_=["variable", "value"], - ) - .transform_calculate( - variable_label="datum.variable === 's50_limiting_protein_counts' ? 'limiting r-protein' : " - "datum.variable === 's50_23s_rRNA_total_counts' ? '23S rRNA' : " - "datum.variable === 's50_5s_rRNA_total_counts' ? 
'5S rRNA' : '50S subunit'" - ) - .encode( - color=alt.Color("variable_label:N") - .title("Component") - .scale( - domain=["limiting r-protein", "23S rRNA", "5S rRNA", "50S subunit"], - range=["#cccccc", "#ff7f0e", "#2ca02c", "#000000"], - ) + x="Time_min", + y="count", + color=alt.Color("component", title="50S Components"), ) - .properties(title="50S Ribosomal Subunit Components", width=600, height=200) + .properties(title="50S Component Counts", width=600) ) - # Combine plots vertically - combined_plot = alt.vconcat(s30_plot, s50_plot).resolve_scale(color="independent") - - # Save the plot - combined_plot.save(os.path.join(outdir, "ribosome_components.html")) + # Combine and save charts + combined = ( + alt.vconcat(chart_30, chart_50) + .resolve_scale(color="independent") + .resolve_legend(color="independent") + ) + combined.save(os.path.join(outdir, "ribosome_components.html")) From 8cc9be9713392ae505a5627e2ce0540039858dcc Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Fri, 27 Jun 2025 05:39:28 +0800 Subject: [PATCH 05/71] Delete personal Config --- configs/hsy/genAnalysis.json | 20 ---- configs/hsy/multigen.json | 184 ----------------------------- configs/hsy/multiseed.json | 181 ---------------------------- configs/hsy/multivariant.json | 196 ------------------------------- configs/hsy/test.json | 19 --- configs/hsy/variantAnalysis.json | 20 ---- 6 files changed, 620 deletions(-) delete mode 100644 configs/hsy/genAnalysis.json delete mode 100644 configs/hsy/multigen.json delete mode 100644 configs/hsy/multiseed.json delete mode 100644 configs/hsy/multivariant.json delete mode 100644 configs/hsy/test.json delete mode 100644 configs/hsy/variantAnalysis.json diff --git a/configs/hsy/genAnalysis.json b/configs/hsy/genAnalysis.json deleted file mode 100644 index eb9ccb45c..000000000 --- a/configs/hsy/genAnalysis.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "emitter_arg": { - "out_dir": "out" - }, - "analysis_options": { - "experiment_id": 
["multigen"], - "variant": [0], - "lineage_seed": [0], - "generation": [1, 2, 3, 4, 5], - "agent_id": ["0"], - "variant_data_dir": ["out/multigen/variant_sim_data"], - "validation_data_path": ["out/multigen/parca/kb/validationData.cPickle"], - "outdir": "out/multigen/analyses", - "cpus": 4, - - "multigeneration": { - "ribosome_components": {} - } - } -} diff --git a/configs/hsy/multigen.json b/configs/hsy/multigen.json deleted file mode 100644 index 949bd7fb4..000000000 --- a/configs/hsy/multigen.json +++ /dev/null @@ -1,184 +0,0 @@ -{ - "inherit_from": [], - "experiment_id": "multigen", - "suffix_time": false, - "description": "A customized simulation for testing new parameters", - "progress_bar": true, - "sim_data_path": null, - "emitter": "parquet", - "emitter_arg": { - "out_dir": "out" - }, - "analysis_options": { - "single": {"mass_fraction_summary": {}}, - "multigeneration": { - "replication": {}, - "new_gene_counts": {}, - "ribosome_components": {}, - "ribosome_crowding": {}, - "ribosomeProduction": {}, - "ribosomeUsage": {}, - "rna_decay_03_high": {} - } - }, - "emit_topology": false, - "emit_processes": false, - "emit_config": false, - "emit_unique": false, - "log_updates": false, - "raw_output": true, - "seed": 0, - "mar_regulon": false, - "amp_lysis": false, - - "initial_state_file": "", - "initial_state_overrides": [], - "initial_state": {}, - "time_step": 1.0, - "total_time": 10800.0, - "initial_global_time": 0.0, - "fail_at_total_time": false, - - "variants": {}, - "skip_baseline": false, - "n_init_sims": 1, - "generations": 5, - "single_daughters": true, - "daughter_outdir": "out", - "lineage_seed": 0, - - "parca_options": { - "cpus": 6, - "outdir": "out", - "operons": true, - "ribosome_fitting": true, - "rnapoly_fitting": true, - "remove_rrna_operons": false, - "remove_rrff": false, - "stable_rrna": false, - "new_genes": "off", - "debug_parca": false, - "load_intermediate": null, - "save_intermediates": false, - "intermediates_directory": "", - 
"variable_elongation_transcription": true, - "variable_elongation_translation": false - }, - - "agent_id": "0", - "divide": true, - "d_period": true, - "division_threshold": true, - "division_variable": ["divide"], - "chromosome_path": ["unique", "full_chromosome"], - "spatial_environment": false, - "spatial_environment_config": {}, - "fixed_media": "minimal", - "condition": "basal", - - "save": false, - "save_times": [], - - "add_processes": [], - "exclude_processes": [], - "swap_processes": {}, - "profile": false, - "processes": [ - "post-division-mass-listener", - - "bulk-timeline", - "media_update", - "exchange_data", - - "ecoli-tf-unbinding", - - "ecoli-equilibrium", - "ecoli-two-component-system", - "ecoli-rna-maturation", - - "ecoli-tf-binding", - - "ecoli-transcript-initiation", - "ecoli-polypeptide-initiation", - "ecoli-chromosome-replication", - "ecoli-protein-degradation", - "ecoli-rna-degradation", - "ecoli-complexation", - - "ecoli-transcript-elongation", - "ecoli-polypeptide-elongation", - - "ecoli-chromosome-structure", - - "ecoli-metabolism", - - "ecoli-mass-listener", - "RNA_counts_listener", - "rna_synth_prob_listener", - "monomer_counts_listener", - "dna_supercoiling_listener", - "replication_data_listener", - "rnap_data_listener", - "unique_molecule_counts", - "ribosome_data_listener", - "global_clock" - ], - "process_configs": { - "global_clock": {}, - "replication_data_listener": {"time_step": 1} - }, - - "topology": { - "bulk-timeline": { - "bulk": ["bulk"], - "global": ["timeline"], - "media_id": ["environment", "media_id"] - }, - "global_clock": { - "global_time": ["global_time"], - "next_update_time": ["next_update_time"] - } - }, - - "flow": { - "post-division-mass-listener": [], - "media_update": [["post-division-mass-listener"]], - "exchange_data": [["media_update"]], - - "ecoli-tf-unbinding": [["media_update"]], - - "ecoli-equilibrium": [["ecoli-tf-unbinding"]], - "ecoli-two-component-system": [["ecoli-tf-unbinding"]], - 
"ecoli-rna-maturation": [["ecoli-tf-unbinding"]], - - "ecoli-tf-binding": [["ecoli-equilibrium"]], - - "ecoli-transcript-initiation": [["ecoli-tf-binding"]], - "ecoli-polypeptide-initiation": [["ecoli-tf-binding"]], - "ecoli-chromosome-replication": [["ecoli-tf-binding"]], - "ecoli-protein-degradation": [["ecoli-tf-binding"]], - "ecoli-rna-degradation": [["ecoli-tf-binding"]], - "ecoli-complexation": [["ecoli-tf-binding"]], - - "ecoli-transcript-elongation": [["ecoli-complexation"]], - "ecoli-polypeptide-elongation": [["ecoli-complexation"]], - - "ecoli-chromosome-structure": [["ecoli-polypeptide-elongation"]], - - "ecoli-metabolism": [["ecoli-chromosome-structure"]], - - "ecoli-mass-listener": [["ecoli-metabolism"]], - "RNA_counts_listener": [["ecoli-metabolism"]], - "rna_synth_prob_listener": [["ecoli-metabolism"]], - "monomer_counts_listener": [["ecoli-metabolism"]], - "dna_supercoiling_listener": [["ecoli-metabolism"]], - "replication_data_listener": [["ecoli-metabolism"]], - "rnap_data_listener": [["ecoli-metabolism"]], - "unique_molecule_counts": [["ecoli-metabolism"]], - "ribosome_data_listener": [["ecoli-metabolism"]] - }, - "engine_process_reports": [ - ["listeners"] - ], - "emit_paths": [] -} \ No newline at end of file diff --git a/configs/hsy/multiseed.json b/configs/hsy/multiseed.json deleted file mode 100644 index a06c91d04..000000000 --- a/configs/hsy/multiseed.json +++ /dev/null @@ -1,181 +0,0 @@ -{ - "inherit_from": [], - "experiment_id": "multiseed", - "suffix_time": false, - "description": "A customized simulation for testing new parameters", - "progress_bar": true, - "sim_data_path": null, - "emitter": "parquet", - "emitter_arg": { - "out_dir": "out" - }, - "analysis_options": { - "single": {"mass_fraction_summary": {}}, - "multiseed": { - "ecocyc_table": {}, - "protein_counts_validation": {}, - "ribosome_spacing":{}, - "subgenerational_expression_table": {} - } - }, - "emit_topology": false, - "emit_processes": false, - "emit_config": false, - 
"emit_unique": false, - "log_updates": false, - "raw_output": true, - "seed": 0, - "mar_regulon": false, - "amp_lysis": false, - - "initial_state_file": "", - "initial_state_overrides": [], - "initial_state": {}, - "time_step": 1.0, - "total_time": 10800.0, - "initial_global_time": 0.0, - "fail_at_total_time": false, - - "variants": {}, - "skip_baseline": false, - "n_init_sims": 1, - "generations": 5, - "single_daughters": true, - "daughter_outdir": "out", - "lineage_seed": 0, - - "parca_options": { - "cpus": 6, - "outdir": "out", - "operons": true, - "ribosome_fitting": true, - "rnapoly_fitting": true, - "remove_rrna_operons": false, - "remove_rrff": false, - "stable_rrna": false, - "new_genes": "off", - "debug_parca": false, - "load_intermediate": null, - "save_intermediates": false, - "intermediates_directory": "", - "variable_elongation_transcription": true, - "variable_elongation_translation": false - }, - - "agent_id": "0", - "divide": true, - "d_period": true, - "division_threshold": true, - "division_variable": ["divide"], - "chromosome_path": ["unique", "full_chromosome"], - "spatial_environment": false, - "spatial_environment_config": {}, - "fixed_media": "minimal", - "condition": "basal", - - "save": false, - "save_times": [], - - "add_processes": [], - "exclude_processes": [], - "swap_processes": {}, - "profile": false, - "processes": [ - "post-division-mass-listener", - - "bulk-timeline", - "media_update", - "exchange_data", - - "ecoli-tf-unbinding", - - "ecoli-equilibrium", - "ecoli-two-component-system", - "ecoli-rna-maturation", - - "ecoli-tf-binding", - - "ecoli-transcript-initiation", - "ecoli-polypeptide-initiation", - "ecoli-chromosome-replication", - "ecoli-protein-degradation", - "ecoli-rna-degradation", - "ecoli-complexation", - - "ecoli-transcript-elongation", - "ecoli-polypeptide-elongation", - - "ecoli-chromosome-structure", - - "ecoli-metabolism", - - "ecoli-mass-listener", - "RNA_counts_listener", - "rna_synth_prob_listener", - 
"monomer_counts_listener", - "dna_supercoiling_listener", - "replication_data_listener", - "rnap_data_listener", - "unique_molecule_counts", - "ribosome_data_listener", - "global_clock" - ], - "process_configs": { - "global_clock": {}, - "replication_data_listener": {"time_step": 1} - }, - - "topology": { - "bulk-timeline": { - "bulk": ["bulk"], - "global": ["timeline"], - "media_id": ["environment", "media_id"] - }, - "global_clock": { - "global_time": ["global_time"], - "next_update_time": ["next_update_time"] - } - }, - - "flow": { - "post-division-mass-listener": [], - "media_update": [["post-division-mass-listener"]], - "exchange_data": [["media_update"]], - - "ecoli-tf-unbinding": [["media_update"]], - - "ecoli-equilibrium": [["ecoli-tf-unbinding"]], - "ecoli-two-component-system": [["ecoli-tf-unbinding"]], - "ecoli-rna-maturation": [["ecoli-tf-unbinding"]], - - "ecoli-tf-binding": [["ecoli-equilibrium"]], - - "ecoli-transcript-initiation": [["ecoli-tf-binding"]], - "ecoli-polypeptide-initiation": [["ecoli-tf-binding"]], - "ecoli-chromosome-replication": [["ecoli-tf-binding"]], - "ecoli-protein-degradation": [["ecoli-tf-binding"]], - "ecoli-rna-degradation": [["ecoli-tf-binding"]], - "ecoli-complexation": [["ecoli-tf-binding"]], - - "ecoli-transcript-elongation": [["ecoli-complexation"]], - "ecoli-polypeptide-elongation": [["ecoli-complexation"]], - - "ecoli-chromosome-structure": [["ecoli-polypeptide-elongation"]], - - "ecoli-metabolism": [["ecoli-chromosome-structure"]], - - "ecoli-mass-listener": [["ecoli-metabolism"]], - "RNA_counts_listener": [["ecoli-metabolism"]], - "rna_synth_prob_listener": [["ecoli-metabolism"]], - "monomer_counts_listener": [["ecoli-metabolism"]], - "dna_supercoiling_listener": [["ecoli-metabolism"]], - "replication_data_listener": [["ecoli-metabolism"]], - "rnap_data_listener": [["ecoli-metabolism"]], - "unique_molecule_counts": [["ecoli-metabolism"]], - "ribosome_data_listener": [["ecoli-metabolism"]] - }, - 
"engine_process_reports": [ - ["listeners"] - ], - "emit_paths": [] -} \ No newline at end of file diff --git a/configs/hsy/multivariant.json b/configs/hsy/multivariant.json deleted file mode 100644 index 25d40defa..000000000 --- a/configs/hsy/multivariant.json +++ /dev/null @@ -1,196 +0,0 @@ -{ - "inherit_from": [], - "experiment_id": "multivariant", - "suffix_time": false, - "description": "A customized simulation for testing new parameters", - "progress_bar": true, - "sim_data_path": null, - "emitter": "parquet", - "emitter_arg": { - "out_dir": "out" - }, - "analysis_options": { - "single": {"mass_fraction_summary": {}}, - "multivariant": { - "average_monomer_counts": {}, - "doubling_time_hist": {}, - "doubling_time_line": {}, - "dummy": {}, - "new_gene_translation_efficiency_heatmaps": {} - } - }, - "emit_topology": false, - "emit_processes": false, - "emit_config": false, - "emit_unique": false, - "log_updates": false, - "raw_output": true, - "seed": 0, - "mar_regulon": false, - "amp_lysis": false, - - "initial_state_file": "", - "initial_state_overrides": [], - "initial_state": {}, - "time_step": 1.0, - "total_time": 10800.0, - "initial_global_time": 0.0, - "fail_at_total_time": false, - - "variants": { - "variant_test": { - "a": {"value": [1, 2]}, - "b": {"value": ["one", "two"]}, - "c": { - "nested": { - "d": {"value": [3, 4]}, - "e": {"value": [5.0, 6.0]}, - "op": "zip" - } - }, - "op": "prod" - } - }, - - "skip_baseline": false, - "n_init_sims": 1, - "generations": 2, - "single_daughters": true, - "daughter_outdir": "out", - "lineage_seed": 0, - - "parca_options": { - "cpus": 6, - "outdir": "out", - "operons": true, - "ribosome_fitting": true, - "rnapoly_fitting": true, - "remove_rrna_operons": false, - "remove_rrff": false, - "stable_rrna": false, - "new_genes": "off", - "debug_parca": false, - "load_intermediate": null, - "save_intermediates": false, - "intermediates_directory": "", - "variable_elongation_transcription": true, - 
"variable_elongation_translation": false - }, - - "agent_id": "0", - "divide": true, - "d_period": true, - "division_threshold": true, - "division_variable": ["divide"], - "chromosome_path": ["unique", "full_chromosome"], - "spatial_environment": false, - "spatial_environment_config": {}, - "fixed_media": "minimal", - "condition": "basal", - - "save": false, - "save_times": [], - - "add_processes": [], - "exclude_processes": [], - "swap_processes": {}, - "profile": false, - "processes": [ - "post-division-mass-listener", - - "bulk-timeline", - "media_update", - "exchange_data", - - "ecoli-tf-unbinding", - - "ecoli-equilibrium", - "ecoli-two-component-system", - "ecoli-rna-maturation", - - "ecoli-tf-binding", - - "ecoli-transcript-initiation", - "ecoli-polypeptide-initiation", - "ecoli-chromosome-replication", - "ecoli-protein-degradation", - "ecoli-rna-degradation", - "ecoli-complexation", - - "ecoli-transcript-elongation", - "ecoli-polypeptide-elongation", - - "ecoli-chromosome-structure", - - "ecoli-metabolism", - - "ecoli-mass-listener", - "RNA_counts_listener", - "rna_synth_prob_listener", - "monomer_counts_listener", - "dna_supercoiling_listener", - "replication_data_listener", - "rnap_data_listener", - "unique_molecule_counts", - "ribosome_data_listener", - "global_clock" - ], - "process_configs": { - "global_clock": {}, - "replication_data_listener": {"time_step": 1} - }, - - "topology": { - "bulk-timeline": { - "bulk": ["bulk"], - "global": ["timeline"], - "media_id": ["environment", "media_id"] - }, - "global_clock": { - "global_time": ["global_time"], - "next_update_time": ["next_update_time"] - } - }, - - "flow": { - "post-division-mass-listener": [], - "media_update": [["post-division-mass-listener"]], - "exchange_data": [["media_update"]], - - "ecoli-tf-unbinding": [["media_update"]], - - "ecoli-equilibrium": [["ecoli-tf-unbinding"]], - "ecoli-two-component-system": [["ecoli-tf-unbinding"]], - "ecoli-rna-maturation": [["ecoli-tf-unbinding"]], - - 
"ecoli-tf-binding": [["ecoli-equilibrium"]], - - "ecoli-transcript-initiation": [["ecoli-tf-binding"]], - "ecoli-polypeptide-initiation": [["ecoli-tf-binding"]], - "ecoli-chromosome-replication": [["ecoli-tf-binding"]], - "ecoli-protein-degradation": [["ecoli-tf-binding"]], - "ecoli-rna-degradation": [["ecoli-tf-binding"]], - "ecoli-complexation": [["ecoli-tf-binding"]], - - "ecoli-transcript-elongation": [["ecoli-complexation"]], - "ecoli-polypeptide-elongation": [["ecoli-complexation"]], - - "ecoli-chromosome-structure": [["ecoli-polypeptide-elongation"]], - - "ecoli-metabolism": [["ecoli-chromosome-structure"]], - - "ecoli-mass-listener": [["ecoli-metabolism"]], - "RNA_counts_listener": [["ecoli-metabolism"]], - "rna_synth_prob_listener": [["ecoli-metabolism"]], - "monomer_counts_listener": [["ecoli-metabolism"]], - "dna_supercoiling_listener": [["ecoli-metabolism"]], - "replication_data_listener": [["ecoli-metabolism"]], - "rnap_data_listener": [["ecoli-metabolism"]], - "unique_molecule_counts": [["ecoli-metabolism"]], - "ribosome_data_listener": [["ecoli-metabolism"]] - }, - "engine_process_reports": [ - ["listeners"] - ], - "emit_paths": [] -} \ No newline at end of file diff --git a/configs/hsy/test.json b/configs/hsy/test.json deleted file mode 100644 index 6b775b732..000000000 --- a/configs/hsy/test.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "experiment_id": "test_multigen_replication", - "suffix_time": false, - "parca_options": { - "cpus": 4 - }, - "fail_at_total_time": true, - "sim_data_path": null, - "generations": 3, - "n_init_sims": 1, - "single_daughters": true, - "emitter": "parquet", - "emitter_arg": { - "out_dir": "out" - }, - "analysis_options": { - "multigeneration": {"replication": {}} - } -} diff --git a/configs/hsy/variantAnalysis.json b/configs/hsy/variantAnalysis.json deleted file mode 100644 index 5718d6e37..000000000 --- a/configs/hsy/variantAnalysis.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "emitter_arg": { - "out_dir": "out" - }, - 
"analysis_options": { - "experiment_id": ["multivariant"], - "variant": [0, 1, 2, 3, 4, 5, 6, 7, 8], - "lineage_seed": [0], - "generation": [1, 2], - "agent_id": ["0"], - "validation_data_path": ["out/multivariant/parca/kb/validationData.cPickle"], - "variant_data_dir": ["out/multivariant/variant_sim_data"], - "outdir": "out/multivariant/analyses", - "cpus": 4, - - "multivariant": { - "average_monomer_counts": {} - } - } -} From 024d0bbcb7792f36239cf9481d9565384b159348 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Fri, 27 Jun 2025 12:16:09 -0700 Subject: [PATCH 06/71] Add actions read permission to Docker image security scan --- .github/workflows/docker_security.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docker_security.yml b/.github/workflows/docker_security.yml index 124b1f78a..5ebc098c6 100644 --- a/.github/workflows/docker_security.yml +++ b/.github/workflows/docker_security.yml @@ -4,6 +4,7 @@ permissions: contents: read security-events: write pull-requests: write + actions: read on: push: From 37096ed767f34a134bce4abd8c09d38bb614ce6d Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Fri, 27 Jun 2025 12:18:18 -0700 Subject: [PATCH 07/71] Test docker scan in PR --- .github/workflows/docker_security.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docker_security.yml b/.github/workflows/docker_security.yml index 5ebc098c6..43ec8c6f9 100644 --- a/.github/workflows/docker_security.yml +++ b/.github/workflows/docker_security.yml @@ -10,6 +10,8 @@ on: push: branches: - master + pull_request: + branches: [master] schedule: - cron: "0 0 * * *" # Runs daily at midnight UTC From bc63c13bcb3177e86e8e4c75bb04e36ae45f7118 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 00:36:17 -0700 Subject: [PATCH 08/71] More informative error message for existing output directory --- runscripts/workflow.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/runscripts/workflow.py 
b/runscripts/workflow.py index 201304cfa..3a2e25911 100644 --- a/runscripts/workflow.py +++ b/runscripts/workflow.py @@ -423,14 +423,25 @@ def main(): config["lineage_seed"] = random.randint(0, 2**31 - 1) filesystem, outdir = parse_uri(out_uri) outdir = os.path.join(outdir, experiment_id, "nextflow") + exp_outdir = os.path.join(outdir, experiment_id) out_uri = os.path.join(out_uri, experiment_id, "nextflow") repo_dir = os.path.dirname(os.path.dirname(__file__)) local_outdir = os.path.join(repo_dir, "nextflow_temp", experiment_id) os.makedirs(local_outdir, exist_ok=True) if filesystem is None: - os.makedirs(outdir, exist_ok=args.resume) + if os.path.exists(exp_outdir) and not args.resume: + raise RuntimeError( + f"Output directory already exists: {exp_outdir}. " + "Please use a different experiment ID or output directory. " + "Alternatively, move, delete, or rename the existing directory." + ) else: - filesystem.makedirs(outdir, exist_ok=args.resume) + if filesystem.exists(exp_outdir) and not args.resume: + raise RuntimeError( + f"Output directory already exists: {exp_outdir}. " + "Please use a different experiment ID or output directory. " + "Alternatively, move, delete, or rename the existing directory." 
+ ) temp_config_path = f"{local_outdir}/workflow_config.json" final_config_path = os.path.join(outdir, "workflow_config.json") final_config_uri = os.path.join(out_uri, "workflow_config.json") From 673924c8bc7794f8a6841314d706b301210c2c00 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 00:36:53 -0700 Subject: [PATCH 09/71] Fix typo --- runscripts/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runscripts/workflow.py b/runscripts/workflow.py index 3a2e25911..0208f254c 100644 --- a/runscripts/workflow.py +++ b/runscripts/workflow.py @@ -409,7 +409,7 @@ def main(): else: out_uri = config["emitter_arg"]["out_uri"] parsed_uri = parse.urlparse(out_uri) - if parsed_uri.schema not in ("local", "file") and not FSSPEC_AVAILABLE: + if parsed_uri.scheme not in ("local", "file") and not FSSPEC_AVAILABLE: raise RuntimeError( f"URI '{out_uri}' specified but fsspec is not available. " "Install fsspec or provide a local URI/out directory." From 2527d9505159602f8126dc6ebc499d561cfb01ad Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 02:06:49 -0700 Subject: [PATCH 10/71] Add curl for authentication on Google Cloud --- runscripts/container/Dockerfile | 5 ++++- runscripts/container/Singularity | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/runscripts/container/Dockerfile b/runscripts/container/Dockerfile index ff326c6c9..d18b93150 100644 --- a/runscripts/container/Dockerfile +++ b/runscripts/container/Dockerfile @@ -14,8 +14,11 @@ RUN echo "alias ls='ls --color=auto'" >> ~/.bashrc \ && echo "alias ll='ls -l'" >> ~/.bashrc \ && cp ~/.bashrc / +# gcc necessary for compiling C extensions in some Python packages. # procps necessary for `ps` command used by Nextflow to track processes. -RUN apt-get update && apt-get install -y gcc procps nano +# nano is a text editor for convenience. 
+# curl is necessary for authentication on Google Cloud VMs +RUN apt-get update && apt-get install -y gcc procps nano curl # Install the project into `/vEcoli` WORKDIR /vEcoli diff --git a/runscripts/container/Singularity b/runscripts/container/Singularity index 201536342..6df713271 100644 --- a/runscripts/container/Singularity +++ b/runscripts/container/Singularity @@ -27,6 +27,6 @@ From: ghcr.io/astral-sh/uv@sha256:1cc0392c8aad8026ef3922e3f997fff0f31e506b0ffe95 FILES_TO_ADD %post - apt-get update && apt-get install -y gcc procps nano + apt-get update && apt-get install -y gcc procps nano curl cd /vEcoli UV_CACHE_DIR="/vEcoli/.uv_cache" UV_COMPILE_BYTECODE=1 uv sync --frozen From cf57b3152d611cbfae5ba5d0d70c4e6fefefc1b8 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 02:30:25 -0700 Subject: [PATCH 11/71] Clarify Google Cloud setup --- README.md | 2 +- doc/gcloud.rst | 33 ++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index a000b1b55..2aa1603d7 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ If your system has git, curl (or wget), and a C compiler On Ubuntu/Debian, apt can be used to install all three prerequisites: - sudo -s eval 'apt update && apt install git curl clang' + sudo -s eval 'apt update && apt install -y git curl clang' On MacOS, curl is preinstalled and git and clang come with the Xcode Command Line Tools: diff --git a/doc/gcloud.rst b/doc/gcloud.rst index 6080f579f..9f7553264 100644 --- a/doc/gcloud.rst +++ b/doc/gcloud.rst @@ -123,21 +123,28 @@ right service account and project. Next, install Git and clone the vEcoli reposi .. 
code-block:: bash - sudo apt update && sudo apt install git - git clone https://github.com/CovertLab/vEcoli.git + # zip and unzip necessary to install SDKMAN to get Java for nextflow + sudo apt update && sudo apt install -y git zip unzip + git clone https://github.com/CovertLab/vEcoli.git --filter=blob:none + cd vEcoli -Now follow the installation instructions from the README starting with -installing ``uv`` and finishing with installing Nextflow. +`Install uv `_, then +create a new virtual environment and install GCSFS: -.. note:: - Technically, the only requirements to run :mod:`runscripts.workflow` on Google Cloud - are Nextflow, Python 3.9+, and `GCSFS `_. - The workflow steps will be run inside Docker containers (see - :ref:`docker-images`). The other Python requirements can be - omitted for a more minimal installation. You will need to use - :ref:`interactive containers ` to run the model using - any interface other than :mod:`runscripts.workflow`, but this may be a good - thing for maximum reproducibility. +.. code-block:: bash + + source ~/.bashrc + uv venv + uv pip install gcsfs + +Run the following to automatically activate the virtual environment: + +.. code-block:: bash + + echo "source ~/vEcoli/.venv/bin/activate" >> ~/.bashrc + source ~/.bashrc + +Finally, `install Nextflow `_. ------------------ Create Your Bucket From b54750c2c0986dc6742dc1f250633aeb8786fb58 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 02:50:05 -0700 Subject: [PATCH 12/71] Make outdirs for Nextflow config and workflow files to be copied --- runscripts/workflow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runscripts/workflow.py b/runscripts/workflow.py index 0208f254c..6f8d119ec 100644 --- a/runscripts/workflow.py +++ b/runscripts/workflow.py @@ -435,6 +435,7 @@ def main(): "Please use a different experiment ID or output directory. " "Alternatively, move, delete, or rename the existing directory." 
) + os.makedirs(outdir, exist_ok=True) else: if filesystem.exists(exp_outdir) and not args.resume: raise RuntimeError( @@ -442,6 +443,7 @@ def main(): "Please use a different experiment ID or output directory. " "Alternatively, move, delete, or rename the existing directory." ) + filesystem.makedirs(outdir, exist_ok=True) temp_config_path = f"{local_outdir}/workflow_config.json" final_config_path = os.path.join(outdir, "workflow_config.json") final_config_uri = os.path.join(out_uri, "workflow_config.json") From 43bac0d58563709e7ea9c0238b195cd88ad2f452 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 03:00:55 -0700 Subject: [PATCH 13/71] Fix outdir determination --- runscripts/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runscripts/workflow.py b/runscripts/workflow.py index 6f8d119ec..c906b57df 100644 --- a/runscripts/workflow.py +++ b/runscripts/workflow.py @@ -423,7 +423,7 @@ def main(): config["lineage_seed"] = random.randint(0, 2**31 - 1) filesystem, outdir = parse_uri(out_uri) outdir = os.path.join(outdir, experiment_id, "nextflow") - exp_outdir = os.path.join(outdir, experiment_id) + exp_outdir = os.path.dirname(outdir) out_uri = os.path.join(out_uri, experiment_id, "nextflow") repo_dir = os.path.dirname(os.path.dirname(__file__)) local_outdir = os.path.join(repo_dir, "nextflow_temp", experiment_id) From 223e5337d18a421c91159b4cacec74128546a69d Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 05:02:49 -0700 Subject: [PATCH 14/71] Fixes to ParquetEmitter finalization GCSFS schedules future, which does not work in atexit callbacks, so we revert back to PyArrow. Additionally, we now use os._exit to explicitly set an exit code of 1 when an exception is raised during finalization because Python ignores exceptions in atexit callbacks by default. 
--- ecoli/library/parquet_emitter.py | 100 ++++++++++++++++++------------- 1 file changed, 58 insertions(+), 42 deletions(-) diff --git a/ecoli/library/parquet_emitter.py b/ecoli/library/parquet_emitter.py index 2257f2045..74b93e663 100644 --- a/ecoli/library/parquet_emitter.py +++ b/ecoli/library/parquet_emitter.py @@ -1,5 +1,6 @@ import atexit import os +import sys from concurrent.futures import Future, ThreadPoolExecutor from typing import Any, Callable, cast, Mapping, Optional from urllib import parse @@ -7,9 +8,9 @@ import duckdb import numpy as np import polars as pl +from pyarrow import fs from polars.datatypes import DataTypeClass from fsspec.core import filesystem, url_to_fs, OpenFile -from fsspec.spec import AbstractFileSystem from tqdm import tqdm from vivarium.core.emitter import Emitter @@ -65,7 +66,7 @@ def json_to_parquet( emit_dict: dict[str, np.ndarray | list[pl.Series]], outfile: str, schema: dict[str, Any], - filesystem: AbstractFileSystem, + filesystem: fs.FileSystem, ): """Convert dictionary to Parquet. @@ -74,7 +75,7 @@ def json_to_parquet( or lists of Polars Series (variable-shape). outfile: Path to output Parquet file. Can be local path or URI. schema: Full mapping of column names to Polars dtypes. - filesystem: On local filesystem, fsspec filesystem needed to + filesystem: On local filesystem, PyArrow filesystem needed to write Parquet file atomically. """ tbl = pl.DataFrame(emit_dict, schema={k: schema[k] for k in emit_dict}) @@ -82,11 +83,13 @@ def json_to_parquet( # trying to read partially written Parquet files. Get around this by writing # to a temporary file and then renaming it to the final output file. 
temp_outfile = outfile - if parse.urlparse(outfile).scheme in ("", "file", "local"): + parsed_outfile = parse.urlparse(outfile) + if parsed_outfile.scheme in ("", "file", "local"): + outfile = os.path.join(parsed_outfile.netloc, parsed_outfile.path) temp_outfile = outfile + ".tmp" tbl.write_parquet(temp_outfile, statistics=False) if temp_outfile != outfile: - filesystem.mv(temp_outfile, outfile) + filesystem.move(temp_outfile, outfile) def union_by_name(query_sql: str) -> str: @@ -835,8 +838,7 @@ def __init__(self, config: dict[str, Any]) -> None: self.out_uri = os.path.abspath(config["out_dir"]) else: self.out_uri = config["out_uri"] - self.filesystem: AbstractFileSystem - self.filesystem, _ = url_to_fs(self.out_uri) + self.filesystem, self.out_dir = fs.FileSystem.from_uri(self.out_uri) self.batch_size = config.get("batch_size", 400) self.threaded = config.get("threaded", True) if self.threaded: @@ -866,41 +868,54 @@ def _finalize(self): this is done by :py:class:`~ecoli.experiments.ecoli_master_sim.EcoliSim` upon reaching division. 
""" - # Wait for last batch to finish writing - self.last_batch_future.result() - # Flush any remaining buffered emits to Parquet - outfile = os.path.join( - self.out_uri, - self.experiment_id, - "history", - self.partitioning_path, - f"{self.num_emits}.pq", - ) - self.filesystem.makedirs(os.path.dirname(outfile), exist_ok=True) - if not self.filesystem.exists(outfile): - for k, v in self.buffered_emits.items(): - self.buffered_emits[k] = v[: self.num_emits % self.batch_size] - json_to_parquet( - self.buffered_emits, outfile, self.pl_types, self.filesystem - ) - # Hive-partitioned directory that only contains successful sims - if self.success: - success_file = os.path.join( + try: + # Wait for last batch to finish writing + self.last_batch_future.result() + # Flush any remaining buffered emits to Parquet + outfile = os.path.join( self.out_uri, self.experiment_id, - "success", + "history", self.partitioning_path, - "s.pq", + f"{self.num_emits}.pq", ) - try: - self.filesystem.delete(os.path.dirname(success_file), recursive=True) - except (FileNotFoundError, OSError): - pass - self.filesystem.makedirs(os.path.dirname(success_file)) - pl.DataFrame({"success": [True]}).write_parquet( - success_file, - statistics=False, + # PyArrow filesystem requires path, not URI + self.filesystem.create_dir( + self.out_dir + os.path.dirname(outfile)[len(self.out_uri) :] ) + # Write remaining buffered emits + if self.num_emits % self.batch_size != 0: + for k, v in self.buffered_emits.items(): + self.buffered_emits[k] = v[: self.num_emits % self.batch_size] + json_to_parquet( + self.buffered_emits, outfile, self.pl_types, self.filesystem + ) + # Hive-partitioned directory that only contains successful sims + if self.success: + success_file = os.path.join( + self.out_uri, + self.experiment_id, + "success", + self.partitioning_path, + "s.pq", + ) + success_dir = ( + self.out_dir + os.path.dirname(success_file)[len(self.out_uri) :] + ) + try: + self.filesystem.delete_dir(success_dir) + 
except (FileNotFoundError, OSError): + pass + self.filesystem.create_dir(success_dir) + pl.DataFrame({"success": [True]}).write_parquet( + success_file, + statistics=False, + ) + except Exception as e: + # Since Python ignores exceptions in atexit callbacks, + # we need to explicitly set the exit code + print(f"Error during ParquetEmitter finalization: {e}", file=sys.stderr) + os._exit(1) def emit(self, data: dict[str, Any]): """ @@ -971,13 +986,14 @@ def emit(self, data: dict[str, Any]): self.partitioning_path, "config.pq", ) + config_dir = self.out_dir + os.path.dirname(outfile)[len(self.out_uri) :] # Cleanup any existing output files from previous runs then # create new folder for config / simulation output try: - self.filesystem.delete(os.path.dirname(outfile), recursive=True) + self.filesystem.delete_dir(config_dir) except (FileNotFoundError, OSError): pass - self.filesystem.makedirs(os.path.dirname(outfile)) + self.filesystem.create_dir(config_dir) self.last_batch_future = self.executor.submit( json_to_parquet, config_emit, @@ -987,13 +1003,13 @@ def emit(self, data: dict[str, Any]): ) # Delete any sim output files in final filesystem history_outdir = os.path.join( - self.out_uri, self.experiment_id, "history", self.partitioning_path + self.out_dir, self.experiment_id, "history", self.partitioning_path ) try: - self.filesystem.delete(history_outdir, recursive=True) + self.filesystem.delete_dir(history_outdir) except (FileNotFoundError, OSError): pass - self.filesystem.makedirs(history_outdir) + self.filesystem.create_dir(history_outdir) return # Each Engine that uses this emitter should only simulate a single cell # In lineage simulations, StopAfterDivision Step will terminate From 54731fcd82c2df32a3fceeb8065fa044638b6ef4 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 05:03:49 -0700 Subject: [PATCH 15/71] Give sim chance to finish cleanly in runscripts/sim.py --- runscripts/sim.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 
deletions(-) diff --git a/runscripts/sim.py b/runscripts/sim.py index 3f602cdde..28820644e 100644 --- a/runscripts/sim.py +++ b/runscripts/sim.py @@ -1,4 +1,5 @@ import os +import signal import sys import subprocess @@ -16,8 +17,15 @@ def main(): # Forward all arguments cmd = [sys.executable, script_path] + sys.argv[1:] # Execute and forward exit code - result = subprocess.run(cmd) - return result.returncode + proc = subprocess.Popen(cmd) + try: + proc.wait() + # Give subprocess chance to finish cleanly + except Exception as e: + proc.send_signal(signal.SIGINT) + proc.wait() + raise e + return proc.returncode if __name__ == "__main__": From 839521ece0efd114bf85a5809de43439e5290785 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Sun, 29 Jun 2025 01:08:48 +0800 Subject: [PATCH 16/71] Add multigeneration analysis focused on replication, transcription and translation --- ecoli/analysis/multigeneration/replication.py | 10 + .../multigeneration/ribosomeProduction.py | 643 +++++++++------- .../analysis/multigeneration/ribosomeUsage.py | 713 +++++++++++------- .../multigeneration/ribosome_components.py | 12 +- .../multigeneration/ribosome_crowding.py | 434 ++++++----- 5 files changed, 1065 insertions(+), 747 deletions(-) diff --git a/ecoli/analysis/multigeneration/replication.py b/ecoli/analysis/multigeneration/replication.py index 512e4b431..fb1ef7d39 100644 --- a/ecoli/analysis/multigeneration/replication.py +++ b/ecoli/analysis/multigeneration/replication.py @@ -1,3 +1,11 @@ +""" +The multigeneration aanlysis method `replication` +1. Record the DNA polymerase position vs time +2. Record # of pairs of replication forks +3. Record the factors of critical initial mass and dry mass +4. 
Record # of oriC +""" + import altair as alt import os from typing import Any @@ -14,6 +22,8 @@ CRITICAL_N = [1, 2, 4, 8] +# ----------------------------------------- # + def plot( params: dict[str, Any], diff --git a/ecoli/analysis/multigeneration/ribosomeProduction.py b/ecoli/analysis/multigeneration/ribosomeProduction.py index 70635bf75..82b94fb4a 100644 --- a/ecoli/analysis/multigeneration/ribosomeProduction.py +++ b/ecoli/analysis/multigeneration/ribosomeProduction.py @@ -1,3 +1,11 @@ +""" +Record several things: +1. normalised dry mass over time +2. cell, 5S RNA, 16S RNA, and 23S rRNA doubling time (be calculated use the `log(2)` formulation) +3. 5S RNA, 16S RNA, and 23S rRNA initiation probability +4. Ribosome elongation rate +""" + import altair as alt import os from typing import Any @@ -5,12 +13,98 @@ import polars as pl import numpy as np from duckdb import DuckDBPyConnection +import pandas as pd from ecoli.library.parquet_emitter import ( - field_metadata, open_arbitrary_sim_data, ) +# ----------------------------------------- # + + +def make_get_bulk_counts(sim_data): + """ + Create a function to extract counts of specified bulk molecules using sim_data indices. + + Args: + sim_data: Simulation data object containing molecule IDs and related information. + + Returns: + A function that takes a DataFrame and list of molecule IDs and returns their total counts. + """ + # Get Molecular ID from bulk_molecules + try: + molecule_ids_list = sim_data.internal_state.bulk_molecules.bulk_data[ + "id" + ].tolist() + except AttributeError: + raise ValueError("[ERROR] Check the structure of `sim_data`") + + mol_id_to_index = {mol_id: idx for idx, mol_id in enumerate(molecule_ids_list)} + + def get_bulk_counts(df, molecule_ids): + """ + Extract total counts of specified molecule IDs from the 'bulk' column. + + Args: + df: Polars DataFrame with a 'bulk' column containing Series of counts. + molecule_ids: List of molecule IDs to sum (e.g., s30_16s_rRNA). 
+ + Returns: + Polars Series with total counts for each row. + """ + indices = [] + for mol_id in molecule_ids: + if mol_id in mol_id_to_index: + indices.append(mol_id_to_index[mol_id]) + else: + print(f"warning: molecular ID '{mol_id}' is missing") + + return ( + df["bulk"] + .map_elements( + lambda counts_series: ( + sum(counts_series[i] for i in indices if i < len(counts_series)) + if isinstance(counts_series, pl.Series) + else 0 + ), + return_dtype=pl.Float64, + ) + .fill_null(0) + ) + + return get_bulk_counts + + +def get_unique_counts(df, molecule_type): + """Get counts of unique molecules (e.g., active ribosomes) from listeners.""" + col_name = f"listeners__unique_molecule_counts__{molecule_type}" + if col_name in df.columns: + return df[col_name].fill_null(0) + return pl.Series(np.zeros(len(df), dtype=np.int64)) + + +# Calculate the RNA doubling times +def calc_rna_doubling_time(produced_col, count_col, borderline): + production_rate = pl.col(produced_col) / pl.col("time_step_sec") + growth_rate = production_rate / pl.col(count_col) + doubling_time_min = np.log(2) / growth_rate / 60.0 + + # data sanitation + valid_condition = ( + (pl.col(produced_col) >= 0) + & (pl.col(count_col) > 0) + & (growth_rate > 0) + & doubling_time_min.is_finite() + & (doubling_time_min > 0) + & (doubling_time_min < 2 * borderline) + ) + + return pl.when(valid_condition).then(doubling_time_min).otherwise(pl.lit(None)) + + +# ----------------------------------------- # + def plot( params: dict[str, Any], @@ -24,92 +118,23 @@ def plot( variant_metadata: dict[str, dict[int, Any]], variant_names: dict[str, str], ): + """Visualize ribosome production metrics for E. 
coli simulation.""" # Load sim_data with open_arbitrary_sim_data(sim_data_dict) as f: sim_data = pickle.load(f) - # Get expected doubling time - expected_doubling_time = sim_data.doubling_time - - # Get ribosomal RNA IDs - ids_16s = [] - ids_16s.extend(sim_data.molecule_groups.s30_16s_rRNA) - ids_16s.append(sim_data.molecule_ids.s30_full_complex) - - ids_23s = [] - ids_23s.extend(sim_data.molecule_groups.s50_23s_rRNA) - ids_23s.append(sim_data.molecule_ids.s50_full_complex) - - ids_5s = [] - ids_5s.extend(sim_data.molecule_groups.s50_5s_rRNA) - ids_5s.append(sim_data.molecule_ids.s50_full_complex) - - # Get indices for ribosomal RNAs - bulk_molecule_ids = field_metadata(conn, config_sql, "listeners__bulk_molecules") - ids_16s_indexes = [ - bulk_molecule_ids.index(mol_id) - for mol_id in ids_16s - if mol_id in bulk_molecule_ids - ] - ids_23s_indexes = [ - bulk_molecule_ids.index(mol_id) - for mol_id in ids_23s - if mol_id in bulk_molecule_ids - ] - ids_5s_indexes = [ - bulk_molecule_ids.index(mol_id) - for mol_id in ids_5s - if mol_id in bulk_molecule_ids - ] + # Get expected doubling time in minutes + sim_doubling_time_min = sim_data.doubling_time.asNumber() - # Get unique molecule index for active ribosome - unique_molecule_ids = field_metadata( - conn, config_sql, "listeners__unique_molecules" - ) - ribosome_index = ( - unique_molecule_ids.index("active_ribosome") - if "active_ribosome" in unique_molecule_ids - else None - ) - - # Get cistron indices for rRNA - cistron_ids = [ - cistron["id"] for cistron in sim_data.process.transcription.cistron_data - ] - - idx_16s = [] - for id16s in sim_data.molecule_groups.s30_16s_rRNA: - cistron_id = id16s[:-3] # Remove _rna suffix - if cistron_id in cistron_ids: - idx_16s.append(cistron_ids.index(cistron_id)) - - idx_23s = [] - for id23s in sim_data.molecule_groups.s50_23s_rRNA: - cistron_id = id23s[:-3] # Remove _rna suffix - if cistron_id in cistron_ids: - idx_23s.append(cistron_ids.index(cistron_id)) - - idx_5s = [] - 
for id5s in sim_data.molecule_groups.s50_5s_rRNA: - cistron_id = id5s[:-3] # Remove _rna suffix - if cistron_id in cistron_ids: - idx_5s.append(cistron_ids.index(cistron_id)) - - # Calculate expected initiation probabilities - condition = sim_data.condition - cistron_synth_prob = sim_data.process.transcription.cistron_tu_mapping_matrix.dot( - sim_data.process.transcription.rna_synth_prob[condition] - ) - - rrn16s_fit_init_prob = cistron_synth_prob[idx_16s].sum() if idx_16s else 0 - rrn23s_fit_init_prob = cistron_synth_prob[idx_23s].sum() if idx_23s else 0 - rrn5s_fit_init_prob = cistron_synth_prob[idx_5s].sum() if idx_5s else 0 - - # Define columns to read - columns_to_read = [ + required_columns = [ "time", + "variant", + "generation", + "agent_id", + "experiment_id", + "lineage_seed", "listeners__mass__instantaneous_growth_rate", - "listeners__main__timeStepSec", + "listeners__mass__dry_mass", "listeners__ribosome_data__rRNA16S_initiated", "listeners__ribosome_data__rRNA23S_initiated", "listeners__ribosome_data__rRNA5S_initiated", @@ -117,228 +142,294 @@ def plot( "listeners__ribosome_data__rRNA23S_init_prob", "listeners__ribosome_data__rRNA5S_init_prob", "listeners__ribosome_data__total_rna_init", - "listeners__ribosome_data__effectiveElongationRate", + "listeners__ribosome_data__effective_elongation_rate", + "listeners__unique_molecule_counts__active_ribosome", + "bulk", ] - # Add bulk molecule columns - if ids_16s_indexes: - for idx in ids_16s_indexes: - columns_to_read.append(f"listeners__bulk_molecules__{idx}") - if ids_23s_indexes: - for idx in ids_23s_indexes: - columns_to_read.append(f"listeners__bulk_molecules__{idx}") - if ids_5s_indexes: - for idx in ids_5s_indexes: - columns_to_read.append(f"listeners__bulk_molecules__{idx}") - - # Add unique molecule column - if ribosome_index is not None: - columns_to_read.append(f"listeners__unique_molecules__{ribosome_index}") - - # Read data - data_df = conn.execute(f""" - SELECT {", ".join(columns_to_read)} - 
FROM ({history_sql}) - ORDER BY variant_idx, generation, agent_id, time - """).pl() + s30_16s_rRNA = list(sim_data.molecule_groups.s30_16s_rRNA) + [ + sim_data.molecule_ids.s30_full_complex + ] + s50_23s_rRNA = list(sim_data.molecule_groups.s50_23s_rRNA) + [ + sim_data.molecule_ids.s50_full_complex + ] + s50_5s_rRNA = list(sim_data.molecule_groups.s50_5s_rRNA) + [ + sim_data.molecule_ids.s50_full_complex + ] + + # Check available columns + available_columns = ( + conn.sql(f"DESCRIBE ({history_sql})").pl()["column_name"].to_list() + ) + data_columns = [col for col in required_columns if col in available_columns] - # Group by first cell of each generation (assuming agent_id=0 is first cell) - first_cell_data = data_df.filter(pl.col("agent_id") == 0) + print( + f"[INFO] Loading {len(data_columns)} columns for ribosome production analysis" + ) - # Calculate derived metrics - time_min = first_cell_data["time"] / 60 + df = conn.sql(f""" + SELECT {", ".join(data_columns)} + FROM ({history_sql}) + WHERE agent_id = 0 + ORDER BY variant, generation, time + """).pl() + df = df.rename({"variant": "variant_id", "generation": "generation_index"}) + + # Convert time from seconds to minutes + df = df.with_columns((pl.col("time") / 60).alias("time_min")) + + # Calculate mass doubling time + if "listeners__mass__instantaneous_growth_rate" in df.columns: + df = df.with_columns( + doubling_time_min=( + np.log(2) / pl.col("listeners__mass__instantaneous_growth_rate") + ) + / 60 + ) - # Growth rate and doubling time - growth_rate = first_cell_data["listeners__mass__instantaneous_growth_rate"] - doubling_time = np.log(2) / growth_rate + # Create get_bulk_counts function with sim_data + get_bulk_counts_func = make_get_bulk_counts(sim_data) # Calculate rRNA counts - rrn16s_bulk = ( - sum( - [ - first_cell_data[f"listeners__bulk_molecules__{idx}"] - for idx in ids_16s_indexes - ] - ) - if ids_16s_indexes - else pl.lit(0) - ) - rrn23s_bulk = ( - sum( - [ - 
first_cell_data[f"listeners__bulk_molecules__{idx}"] - for idx in ids_23s_indexes - ] - ) - if ids_23s_indexes - else pl.lit(0) - ) - rrn5s_bulk = ( - sum( - [ - first_cell_data[f"listeners__bulk_molecules__{idx}"] - for idx in ids_5s_indexes - ] - ) - if ids_5s_indexes - else pl.lit(0) + df = df.with_columns( + [ + get_bulk_counts_func(df, s30_16s_rRNA).alias("bulk_16s_count"), + get_bulk_counts_func(df, s50_23s_rRNA).alias("bulk_23s_count"), + get_bulk_counts_func(df, s50_5s_rRNA).alias("bulk_5s_count"), + get_unique_counts(df, "active_ribosome").alias("ribosome_count"), + ] ) - if ribosome_index is not None: - ribosome_count = first_cell_data[ - f"listeners__unique_molecules__{ribosome_index}" + # Total rRNA = bulk rRNA + rRNA in active ribosomes + df = df.with_columns( + [ + (pl.col("bulk_16s_count") + pl.col("ribosome_count")).alias("rrn16s_count"), + (pl.col("bulk_23s_count") + pl.col("ribosome_count")).alias("rrn23s_count"), + (pl.col("bulk_5s_count") + pl.col("ribosome_count")).alias("rrn5s_count"), ] - rrn16s_count = rrn16s_bulk + ribosome_count - rrn23s_count = rrn23s_bulk + ribosome_count - rrn5s_count = rrn5s_bulk + ribosome_count - else: - rrn16s_count = rrn16s_bulk - rrn23s_count = rrn23s_bulk - rrn5s_count = rrn5s_bulk - - # Calculate rRNA doubling times - time_step = first_cell_data["listeners__main__timeStepSec"] - - rrn16s_produced = first_cell_data["listeners__ribosome_data__rRNA16S_initiated"] - rrn23s_produced = first_cell_data["listeners__ribosome_data__rRNA23S_initiated"] - rrn5s_produced = first_cell_data["listeners__ribosome_data__rRNA5S_initiated"] - - # Avoid division by zero - rrn16s_doubling_time = ( - pl.when(rrn16s_produced > 0) - .then(np.log(2) / ((1 / time_step) * (rrn16s_produced / rrn16s_count))) - .otherwise(None) - / 60 - ) # Convert to minutes - - rrn23s_doubling_time = ( - pl.when(rrn23s_produced > 0) - .then(np.log(2) / ((1 / time_step) * (rrn23s_produced / rrn23s_count))) - .otherwise(None) - / 60 ) - rrn5s_doubling_time = 
( - pl.when(rrn5s_produced > 0) - .then(np.log(2) / ((1 / time_step) * (rrn5s_produced / rrn5s_count))) - .otherwise(None) - / 60 + # Calculate time step + df = df.with_columns( + pl.col("time") + .diff() + .over(["variant_id", "generation_index", "agent_id"]) + .alias("time_step_sec") ) - - # Prepare plotting dataframe - plot_data = first_cell_data.with_columns( - [ - time_min.alias("Time (min)"), - doubling_time.alias("Doubling Time (min)"), - rrn16s_doubling_time.alias("16S Doubling Time (min)"), - rrn23s_doubling_time.alias("23S Doubling Time (min)"), - rrn5s_doubling_time.alias("5S Doubling Time (min)"), - ( - first_cell_data["listeners__ribosome_data__rRNA16S_init_prob"] - / first_cell_data["listeners__ribosome_data__total_rna_init"] - ).alias("16S Init Prob"), - ( - first_cell_data["listeners__ribosome_data__rRNA23S_init_prob"] - / first_cell_data["listeners__ribosome_data__total_rna_init"] - ).alias("23S Init Prob"), - ( - first_cell_data["listeners__ribosome_data__rRNA5S_init_prob"] - / first_cell_data["listeners__ribosome_data__total_rna_init"] - ).alias("5S Init Prob"), - first_cell_data["listeners__ribosome_data__effectiveElongationRate"].alias( - "Elongation Rate (aa/s)" - ), - pl.lit(expected_doubling_time.as_number() / 60).alias( - "Expected Doubling Time (min)" - ), - pl.lit(rrn16s_fit_init_prob).alias("Expected 16S Init Prob"), - pl.lit(rrn23s_fit_init_prob).alias("Expected 23S Init Prob"), - pl.lit(rrn5s_fit_init_prob).alias("Expected 5S Init Prob"), - ] + df = df.with_columns( + time_step_sec=pl.when(pl.col("time_step_sec").is_null()) + .then(pl.col("time")) + .otherwise(pl.col("time_step_sec")) ) - # Create plots - base = alt.Chart(plot_data).add_selection(alt.selection_interval(bind="scales")) + if "listeners__ribosome_data__rRNA16S_initiated" in df.columns: + df = df.with_columns( + rrn16S_doubling_time_min=calc_rna_doubling_time( + "listeners__ribosome_data__rRNA16S_initiated", + "rrn16s_count", + sim_doubling_time_min, + ) + ) + if 
"listeners__ribosome_data__rRNA23S_initiated" in df.columns: + df = df.with_columns( + rrn23S_doubling_time_min=calc_rna_doubling_time( + "listeners__ribosome_data__rRNA23S_initiated", + "rrn23s_count", + sim_doubling_time_min, + ) + ) + if "listeners__ribosome_data__rRNA5S_initiated" in df.columns: + df = df.with_columns( + rrn5S_doubling_time_min=calc_rna_doubling_time( + "listeners__ribosome_data__rRNA5S_initiated", + "rrn5s_count", + sim_doubling_time_min, + ) + ) - # Doubling time plot - doubling_plot = base.mark_line(color="blue").encode( - x=alt.X("Time (min):Q"), - y=alt.Y("Doubling Time (min):Q", title="Doubling Time (min)"), - ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( - x=alt.X("Time (min):Q"), y=alt.Y("Expected Doubling Time (min):Q") - ) + # Calculate initiation probabilities + if "listeners__ribosome_data__rRNA16S_init_prob" in df.columns: + df = df.with_columns( + rrn16S_init_prob_normalized=pl.col( + "listeners__ribosome_data__rRNA16S_init_prob" + ) + ) + if "listeners__ribosome_data__rRNA23S_init_prob" in df.columns: + df = df.with_columns( + rrn23S_init_prob_normalized=pl.col( + "listeners__ribosome_data__rRNA23S_init_prob" + ) + ) + if "listeners__ribosome_data__rRNA5S_init_prob" in df.columns: + df = df.with_columns( + rrn5S_init_prob_normalized=pl.col( + "listeners__ribosome_data__rRNA5S_init_prob" + ) + ) - # 16S doubling time plot - rrn16s_plot = base.mark_line(color="blue").encode( - x=alt.X("Time (min):Q"), - y=alt.Y("16S Doubling Time (min):Q", title="16S Doubling Time (min)"), - ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( - x=alt.X("Time (min):Q"), y=alt.Y("Expected Doubling Time (min):Q") + # Calculate expected initiation probabilities + condition = sim_data.condition + transcription = sim_data.process.transcription + cistron_synth_prob = transcription.cistron_tu_mapping_matrix.dot( + transcription.rna_synth_prob[condition] ) - # 23S doubling time plot - rrn23s_plot = base.mark_line(color="blue").encode( - 
x=alt.X("Time (min):Q"), - y=alt.Y("23S Doubling Time (min):Q", title="23S Doubling Time (min)"), - ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( - x=alt.X("Time (min):Q"), y=alt.Y("Expected Doubling Time (min):Q") + def get_cistron_prob(ids): + indices = [] + for rna_id in ids: + cistron_id = rna_id[:-3] # Remove RNA suffix + idx = np.where(transcription.cistron_data["id"] == cistron_id)[0] + if len(idx) > 0: + indices.append(idx[0]) + return cistron_synth_prob[indices].sum() if indices else 0.0 + + rrn16s_fit_init_prob = get_cistron_prob(sim_data.molecule_groups.s30_16s_rRNA) + rrn23s_fit_init_prob = get_cistron_prob(sim_data.molecule_groups.s50_23s_rRNA) + rrn5s_fit_init_prob = get_cistron_prob(sim_data.molecule_groups.s50_5s_rRNA) + + # Select columns for plotting + plot_columns = ["time_min", "variant_id", "generation_index"] + + # Add other columns + for col in [ + "listeners__mass__dry_mass", + "doubling_time_min", + "rrn16S_doubling_time_min", + "rrn23S_doubling_time_min", + "rrn5S_doubling_time_min", + "rrn16S_init_prob_normalized", + "rrn23S_init_prob_normalized", + "rrn5S_init_prob_normalized", + "listeners__ribosome_data__effective_elongation_rate", + ]: + if col in df.columns: + plot_columns.append(col) + + plot_df = df.select(plot_columns) + + # Calculate initial dry mass at time=0 for each variant and generation + initial_dry_mass = ( + plot_df.filter(pl.col("time_min") == 0) + .select(["variant_id", "listeners__mass__dry_mass"]) + .rename({"listeners__mass__dry_mass": "initial_dry_mass"}) ) - # 5S doubling time plot - rrn5s_plot = base.mark_line(color="blue").encode( - x=alt.X("Time (min):Q"), - y=alt.Y("5S Doubling Time (min):Q", title="5S Doubling Time (min)"), - ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( - x=alt.X("Time (min):Q"), y=alt.Y("Expected Doubling Time (min):Q") - ) + plot_df = plot_df.join(initial_dry_mass, on=["variant_id"], how="left") - # Initiation probability plots - init_16s_plot = 
base.mark_line(color="blue").encode( - x=alt.X("Time (min):Q"), y=alt.Y("16S Init Prob:Q", title="16S Init Prob") - ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( - x=alt.X("Time (min):Q"), y=alt.Y("Expected 16S Init Prob:Q") + plot_df = plot_df.with_columns( + (pl.col("listeners__mass__dry_mass") / pl.col("initial_dry_mass")).alias( + "dry_mass_normalized" + ) ) - init_23s_plot = base.mark_line(color="blue").encode( - x=alt.X("Time (min):Q"), y=alt.Y("23S Init Prob:Q", title="23S Init Prob") - ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( - x=alt.X("Time (min):Q"), y=alt.Y("Expected 23S Init Prob:Q") - ) + # ----------------------------------------- # - init_5s_plot = base.mark_line(color="blue").encode( - x=alt.X("Time (min):Q"), y=alt.Y("5S Init Prob:Q", title="5S Init Prob") - ) + base.mark_line(color="red", strokeDash=[5, 5]).encode( - x=alt.X("Time (min):Q"), y=alt.Y("Expected 5S Init Prob:Q") - ) + def create_line_chart(y_field, title, y_title, reference=None): + base = alt.Chart(plot_df.to_pandas()) + line = base.mark_line().encode( + x=alt.X("time_min:Q", title="Time (min)"), + y=alt.Y(f"{y_field}:Q", title=y_title), + color=alt.Color("variant_id:N", legend=alt.Legend(title="Variant")), + ) + chart = line.properties(title=title, width=600, height=120) + if reference is not None: + ref_line = ( + alt.Chart(pd.DataFrame({"y": [reference]})) + .mark_rule(color="red", strokeDash=[5, 5]) + .encode(y="y:Q") + ) + return chart + ref_line + return chart + + # ----------------------------------------- # + plots = [] + + if "dry_mass_normalized" in plot_df.columns: + plots.append( + create_line_chart( + "dry_mass_normalized", + "Normalized Dry Mass Over Time", + "Dry mass (relative to t=0)", + ) + ) - # Elongation rate plot - elongation_plot = base.mark_line(color="blue").encode( - x=alt.X("Time (min):Q"), - y=alt.Y( - "Elongation Rate (aa/s):Q", title="Average Ribosome Elongation Rate (aa/s)" - ), - ) + if "doubling_time_min" in 
plot_df.columns: + plots.append( + create_line_chart( + "doubling_time_min", + "Cell Doubling Time", + "Doubling Time (min)", + sim_doubling_time_min, + ) + ) + + rna_types = ["16S", "23S", "5S"] + for rna in rna_types: + col_name = f"rrn{rna}_doubling_time_min" + if col_name in plot_df.columns: + plots.append( + create_line_chart( + col_name, + f"{rna} rRNA Doubling Time", + "Doubling Time (min)", + sim_doubling_time_min, + ) + ) + + init_probs = { + "16S": rrn16s_fit_init_prob, + "23S": rrn23s_fit_init_prob, + "5S": rrn5s_fit_init_prob, + } + for rna, ref_prob in init_probs.items(): + col_name = f"rrn{rna}_init_prob_normalized" + if col_name in plot_df.columns: + plots.append( + create_line_chart( + col_name, + f"{rna} rRNA Initiation Probability", + "Probability", + ref_prob, + ) + ) + + if "listeners__ribosome_data__effective_elongation_rate" in plot_df.columns: + plots.append( + create_line_chart( + "listeners__ribosome_data__effective_elongation_rate", + "Ribosome Elongation Rate", + "Amino acids/s", + ) + ) - # Combine all plots vertically - combined_plot = alt.vconcat( - doubling_plot.properties(title="Cell Doubling Time", width=600, height=100), - rrn16s_plot.properties(title="16S rRNA Doubling Time", width=600, height=100), - rrn23s_plot.properties(title="23S rRNA Doubling Time", width=600, height=100), - rrn5s_plot.properties(title="5S rRNA Doubling Time", width=600, height=100), - init_16s_plot.properties( - title="16S rRNA Initiation Probability", width=600, height=100 - ), - init_23s_plot.properties( - title="23S rRNA Initiation Probability", width=600, height=100 - ), - init_5s_plot.properties( - title="5S rRNA Initiation Probability", width=600, height=100 - ), - elongation_plot.properties( - title="Ribosome Elongation Rate", width=600, height=100 - ), - resolve=alt.Resolve(scale=alt.ScaleResolve(y="independent")), + if not plots: + fallback_df = pl.DataFrame( + { + "message": ["No data available for ribosome production visualization"], + "x": [0], 
+ "y": [0], + } + ) + fallback_plot = ( + alt.Chart(fallback_df.to_pandas()) + .mark_text(size=20, color="red") + .encode(x="x:Q", y="y:Q", text="message:N") + .properties( + width=600, + height=400, + title="Ribosome Production Metrics - No Data Available", + ) + ) + plots.append(fallback_plot) + + combined_plot = ( + alt.vconcat(*plots) + .resolve_scale(x="shared", y="independent") + .properties(title="Ribosome Production Metrics") ) - # Save the plot - combined_plot.save(os.path.join(outdir, "ribosome_production.html")) + output_path = os.path.join(outdir, "ribosome_production_report.html") + combined_plot.save(output_path) + print(f"Saved visualization to: {output_path}") + + return combined_plot diff --git a/ecoli/analysis/multigeneration/ribosomeUsage.py b/ecoli/analysis/multigeneration/ribosomeUsage.py index 7167ab900..6b277b248 100644 --- a/ecoli/analysis/multigeneration/ribosomeUsage.py +++ b/ecoli/analysis/multigeneration/ribosomeUsage.py @@ -1,17 +1,94 @@ +""" +Record several things: +1. cell volume over time +2. total / active ribosome count and concentration +3. active ribosome molar / mass fraction +4. Ribosome activation / deactivation count +5. # of AA. be translated +6. the effective ribosome elongation rate +""" + import altair as alt import os from typing import Any - -from duckdb import DuckDBPyConnection import pickle + import polars as pl +import numpy as np +from duckdb import DuckDBPyConnection +import pandas as pd from ecoli.library.parquet_emitter import ( - field_metadata, open_arbitrary_sim_data, - read_stacked_columns, ) +# ----------------------------------------- # + + +def make_get_bulk_counts(sim_data): + """ + Create a function to extract counts of specified bulk molecules using sim_data indices. + + Args: + sim_data: Simulation data object containing molecule IDs and related information. + + Returns: + A function that takes a DataFrame and list of molecule IDs and returns their total counts. 
+ """ + # Get Molecular ID from bulk_molecules + try: + molecule_ids_list = sim_data.internal_state.bulk_molecules.bulk_data[ + "id" + ].tolist() + except AttributeError: + raise ValueError("[ERROR] Check the structure of `sim_data`") + + mol_id_to_index = {mol_id: idx for idx, mol_id in enumerate(molecule_ids_list)} + + def get_bulk_counts(df, molecule_ids): + """ + Extract total counts of specified molecule IDs from the 'bulk' column. + + Args: + df: Polars DataFrame with a 'bulk' column containing Series of counts. + molecule_ids: List of molecule IDs to sum. + + Returns: + Polars Series with total counts for each row. + """ + indices = [] + for mol_id in molecule_ids: + if mol_id in mol_id_to_index: + indices.append(mol_id_to_index[mol_id]) + else: + print(f"warning: molecular ID '{mol_id}' is missing") + + return ( + df["bulk"] + .map_elements( + lambda counts_series: ( + sum(counts_series[i] for i in indices if i < len(counts_series)) + if isinstance(counts_series, pl.Series) + else 0 + ), + return_dtype=pl.Float64, + ) + .fill_null(0) + ) + + return get_bulk_counts + + +def get_unique_counts(df, molecule_type): + """Get counts of unique molecules from listeners.""" + col_name = f"listeners__unique_molecule_counts__{molecule_type}" + if col_name in df.columns: + return df[col_name].fill_null(0) + return pl.Series(np.zeros(len(df), dtype=np.int64)) + + +# ----------------------------------------- # + def plot( params: dict[str, Any], @@ -25,340 +102,412 @@ def plot( variant_metadata: dict[str, dict[int, Any]], variant_names: dict[str, str], ): - # Get sim data from pickle file - with open_arbitrary_sim_data(sim_data_dict) as f: - sim_data = pickle.load(f) + """Visualize ribosome usage statistics for E. 
coli simulation.""" + # ----------------------------------------- # - # Get ids for 30S and 50S subunits - complexIds30S = [sim_data.molecule_ids.s30_full_complex] - complexIds50S = [sim_data.molecule_ids.s50_full_complex] + fg = 1e-15 + fl = 1e-15 - # Get molecular weights for 30S and 50S subunits, and add these two for 70S - nAvogadro = sim_data.constants.n_avogadro - mw30S = sim_data.getter.get_masses(complexIds30S) - mw50S = sim_data.getter.get_masses(complexIds50S) - mw70S = mw30S + mw50S + # ----------------------------------------- # + # Load sim_data + with open_arbitrary_sim_data(sim_data_dict) as f: + sim_data = pickle.load(f) - # Load data - bulk_molecule_data = read_stacked_columns( - history_sql, - [ - "listeners__bulk_molecules__counts", - "listeners__unique_molecule_counts__unique_molecule_counts", - "listeners__ribosome_data__did_initialize", - "listeners__ribosome_data__actual_elongations", - "listeners__ribosome_data__did_terminate", - "listeners__ribosome_data__effective_elongation_rate", - "listeners__mass__cell_mass", - "time", - "time_step_sec", - ], - conn=conn, + required_columns = [ + "time", + "variant", + "generation", + "agent_id", + "experiment_id", + "lineage_seed", + "listeners__mass__instantaneous_growth_rate", + "listeners__mass__cell_mass", + "listeners__mass__volume", + "listeners__ribosome_data__did_initialize", + "listeners__ribosome_data__actual_elongations", + "listeners__ribosome_data__did_terminate", + "listeners__ribosome_data__effective_elongation_rate", + "listeners__unique_molecule_counts__active_ribosome", + "bulk", + ] + + # Get molecular IDs for ribosome subunits + complex_ids_30s = [sim_data.molecule_ids.s30_full_complex] + complex_ids_50s = [sim_data.molecule_ids.s50_full_complex] + + # Get molecular weights + n_avogadro = sim_data.constants.n_avogadro + mw_30s = sim_data.getter.get_masses(complex_ids_30s) + mw_50s = sim_data.getter.get_masses(complex_ids_50s) + mw_70s = mw_30s + mw_50s + + # Check available columns 
+ available_columns = ( + conn.sql(f"DESCRIBE ({history_sql})").pl()["column_name"].to_list() ) + data_columns = [col for col in required_columns if col in available_columns] - # Convert to DataFrame - df = pl.DataFrame(bulk_molecule_data).with_columns( - **{ - "Time (min)": pl.col("time") / 60, - "Cell Volume (L)": (pl.col("listeners__mass__cell_mass") * 1e-15) - / sim_data.constants.cell_density, - } - ) + print(f"[INFO] Loading {len(data_columns)} columns for ribosome usage analysis") - # Get indexes for 30S and 50S subunits based on ids - bulk_molecule_ids = field_metadata( - conn, config_sql, "listeners__bulk_molecules__counts" - ) - complexIndexes30S = [bulk_molecule_ids.index(comp) for comp in complexIds30S] - complexIndexes50S = [bulk_molecule_ids.index(comp) for comp in complexIds50S] + df = conn.sql(f""" + SELECT {", ".join(data_columns)} + FROM ({history_sql}) + WHERE agent_id = 0 + ORDER BY variant, generation, time + """).pl() + df = df.rename({"variant": "variant_id", "generation": "generation_index"}) - # Get indexes for active ribosomes - unique_molecule_ids = field_metadata( - conn, config_sql, "listeners__unique_molecule_counts__unique_molecule_counts" - ) - ribosomeIndex = unique_molecule_ids.index("active_ribosome") + # Convert time from seconds to minutes + df = df.with_columns((pl.col("time") / 60).alias("time_min")) + + # Create get_bulk_counts function with sim_data + get_bulk_counts_func = make_get_bulk_counts(sim_data) - # Extract specific columns from arrays + # Calculate ribosome subunit counts df = df.with_columns( [ - pl.col("listeners__bulk_molecules__counts") - .list.get(complexIndexes30S[0]) - .alias("counts_30S"), - pl.col("listeners__bulk_molecules__counts") - .list.get(complexIndexes50S[0]) - .alias("counts_50S"), - pl.col("listeners__unique_molecule_counts__unique_molecule_counts") - .list.get(ribosomeIndex) - .alias("active_ribosome_counts"), + get_bulk_counts_func(df, complex_ids_30s).alias("counts_30s"), + 
get_bulk_counts_func(df, complex_ids_50s).alias("counts_50s"), + get_unique_counts(df, "active_ribosome").alias("active_ribosome_counts"), ] ) - # Calculate ribosome statistics + # Calculate total ribosome counts and fractions df = df.with_columns( [ - # Total ribosome counts ( pl.col("active_ribosome_counts") - + pl.min_horizontal([pl.col("counts_30S"), pl.col("counts_50S")]) + + pl.min_horizontal(pl.col("counts_30s"), pl.col("counts_50s")) ).alias("total_ribosome_counts"), - # Concentrations ( - (1 / nAvogadro) - * pl.col("active_ribosome_counts") - / pl.col("Cell Volume (L)") - ).alias("active_ribosome_concentration_M"), - # Masses - ((1 / nAvogadro) * pl.col("counts_30S") * mw30S).alias("mass_30S"), - ((1 / nAvogadro) * pl.col("counts_50S") * mw50S).alias("mass_50S"), - ((1 / nAvogadro) * pl.col("active_ribosome_counts") * mw70S).alias( - "active_ribosome_mass" - ), - # Rates per time*volume - ( - pl.col("listeners__ribosome_data__did_initialize") - / (pl.col("time_step_sec") * pl.col("Cell Volume (L)")) - ).alias("activations_per_time_volume"), - ( - pl.col("listeners__ribosome_data__did_terminate") - / (pl.col("time_step_sec") * pl.col("Cell Volume (L)")) - ).alias("deactivations_per_time_volume"), + pl.col("active_ribosome_counts").cast(pl.Float64) + / ( + pl.col("active_ribosome_counts") + + pl.min_horizontal(pl.col("counts_30s"), pl.col("counts_50s")) + ) + ).alias("molar_fraction_active"), ] ) - # Calculate additional derived columns + if "listeners__mass__cell_mass" in df.columns: + cell_density = sim_data.constants.cell_density.asNumber() + df = df.with_columns( + (fg * pl.col("listeners__mass__cell_mass") / cell_density).alias( + "cell_volume" + ) + ) + + # Calculate concentrations df = df.with_columns( [ - # Total ribosome concentration ( - (1 / nAvogadro) - * pl.col("total_ribosome_counts") - / pl.col("Cell Volume (L)") - ).alias("total_ribosome_concentration_M"), - # Molar fraction active + pl.col("total_ribosome_counts") + / n_avogadro.asNumber() + 
/ pl.col("cell_volume") + ).alias("total_ribosome_concentration_mM"), ( - pl.col("active_ribosome_counts").cast(pl.Float64) - / pl.col("total_ribosome_counts") - ).alias("molar_fraction_active"), - # Total ribosome mass and mass fraction - ( - pl.col("active_ribosome_mass") + pl.col("mass_30S") + pl.col("mass_50S") - ).alias("total_ribosome_mass"), + pl.col("active_ribosome_counts") + / n_avogadro.asNumber() + / pl.col("cell_volume") + ).alias("active_ribosome_concentration_mM"), ] ) - # Calculate mass fraction active + # Calculate masses + mw_30s_value = mw_30s.asNumber() if hasattr(mw_30s, "asNumber") else float(mw_30s) + mw_50s_value = mw_50s.asNumber() if hasattr(mw_50s, "asNumber") else float(mw_50s) + mw_70s_value = mw_70s.asNumber() if hasattr(mw_70s, "asNumber") else float(mw_70s) + df = df.with_columns( [ - (pl.col("active_ribosome_mass") / pl.col("total_ribosome_mass")).alias( - "mass_fraction_active" + (pl.col("counts_30s") / n_avogadro.asNumber() * mw_30s_value).alias( + "mass_30s" + ), + (pl.col("counts_50s") / n_avogadro.asNumber() * mw_50s_value).alias( + "mass_50s" ), + ( + pl.col("active_ribosome_counts") / n_avogadro.asNumber() * mw_70s_value + ).alias("active_ribosome_mass"), ] ) - # Convert concentrations to mM df = df.with_columns( [ - (pl.col("active_ribosome_concentration_M") * 1000).alias( - "active_ribosome_concentration_mM" - ), - (pl.col("total_ribosome_concentration_M") * 1000).alias( - "total_ribosome_concentration_mM" - ), + ( + pl.col("active_ribosome_mass") + pl.col("mass_30s") + pl.col("mass_50s") + ).alias("total_ribosome_mass"), + ( + pl.col("active_ribosome_mass") + / ( + pl.col("active_ribosome_mass") + + pl.col("mass_30s") + + pl.col("mass_50s") + ) + ).alias("mass_fraction_active"), ] ) - # Create individual plots + if "time" in df.columns: + df = df.with_columns([(pl.col("time") + 1).alias("time_step_sec")]) + + # Calculate rates per time and volume + # if "time_step_sec" in df.columns and "cell_volume" in df.columns: + # df 
= df.with_columns([ + # (pl.col("listeners__ribosome_data__did_initialize") / + # (pl.col("time_step_sec") * pl.col("cell_volume"))).alias("activations_per_time_volume"), + # (pl.col("listeners__ribosome_data__did_terminate") / + # (pl.col("time_step_sec") * pl.col("cell_volume"))).alias("deactivations_per_time_volume") + # ]) + + if "time_step_sec" in df.columns and "cell_volume" in df.columns: + df = df.with_columns( + [ + ( + pl.col("listeners__ribosome_data__did_initialize") + / (pl.col("cell_volume") / fl) + ).alias("activations_per_volume"), + ( + pl.col("listeners__ribosome_data__did_terminate") + / (pl.col("cell_volume") / fl) + ).alias("deactivations_per_volume"), + ] + ) + + # Select columns for plotting + plot_columns = ["time_min", "variant_id", "generation_index"] + + # Add other columns that exist + for col in [ + "time_step_sec", + "cell_volume", + "total_ribosome_counts", + "total_ribosome_concentration_mM", + "active_ribosome_counts", + "active_ribosome_concentration_mM", + "molar_fraction_active", + "mass_fraction_active", + "listeners__ribosome_data__did_initialize", + "listeners__ribosome_data__did_terminate", + "activations_per_volume", + "deactivations_per_volume", + "listeners__ribosome_data__actual_elongations", + "listeners__ribosome_data__effective_elongation_rate", + ]: + if col in df.columns: + plot_columns.append(col) + + plot_df = df.select(plot_columns) + + # ----------------------------------------- # + + def create_line_chart(y_field, title, y_title, skip_first_point=False): + """Create line chart with optional skipping of first data point.""" + data = plot_df.to_pandas() + if skip_first_point: + # Group by variant and generation, skip first point of each group + filtered_data = [] + for (variant_id, generation_index), group in data.groupby( + ["variant_id", "generation_index"] + ): + if len(group) > 1: + filtered_data.append(group.iloc[1:]) + else: + filtered_data.append(group) + data = ( + pd.concat(filtered_data, 
ignore_index=True) if filtered_data else data + ) + + chart = ( + alt.Chart(data) + .mark_line() + .encode( + x=alt.X("time_min:Q", title="Time (min)"), + y=alt.Y(f"{y_field}:Q", title=y_title), + color=alt.Color("variant_id:N", legend=alt.Legend(title="Variant")), + ) + .properties(title=title, width=600, height=120) + ) + + return chart + + # ----------------------------------------- # plots = [] - # Time step plot - timestep_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y("time_step_sec:Q", title="Length of time step (s)"), + # Create all 14 plots following the original order + if "time_step_sec" in plot_df.columns: + plots.append( + create_line_chart( + "time_step_sec", "Length of Time Step", "Length of time step (s)" + ) ) - .properties(title="Time Step", width=300, height=150) - ) - plots.append(timestep_plot) - - # Cell volume plot - volume_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y("Cell Volume (L):Q", title="Cell volume (L)"), + + if "cell_volume" in plot_df.columns: + plots.append(create_line_chart("cell_volume", "Cell Volume", "Cell volume (L)")) + + if "total_ribosome_counts" in plot_df.columns: + plots.append( + create_line_chart( + "total_ribosome_counts", "Total Ribosome Count", "Total ribosome count" + ) ) - .properties(title="Cell Volume", width=300, height=150) - ) - plots.append(volume_plot) - - # Total ribosome counts - total_counts_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y("total_ribosome_counts:Q", title="Total ribosome count"), + + if "total_ribosome_concentration_mM" in plot_df.columns: + plots.append( + create_line_chart( + "total_ribosome_concentration_mM", + "Total Ribosome Concentration", + "[Total ribosome] (mM)", + ) ) - .properties(title="Total Ribosome Count", width=300, height=150) - ) - 
plots.append(total_counts_plot) - - # Total ribosome concentration - total_conc_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y("total_ribosome_concentration_mM:Q", title="[Total ribosome] (mM)"), + + if "active_ribosome_counts" in plot_df.columns: + plots.append( + create_line_chart( + "active_ribosome_counts", + "Active Ribosome Count", + "Active ribosome count", + skip_first_point=True, + ) ) - .properties(title="Total Ribosome Concentration", width=300, height=150) - ) - plots.append(total_conc_plot) - - # Active ribosome counts - active_counts_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y("active_ribosome_counts:Q", title="Active ribosome count"), + + if "active_ribosome_concentration_mM" in plot_df.columns: + plots.append( + create_line_chart( + "active_ribosome_concentration_mM", + "Active Ribosome Concentration", + "[Active ribosome] (mM)", + skip_first_point=True, + ) ) - .properties(title="Active Ribosome Count", width=300, height=150) - ) - plots.append(active_counts_plot) - - # Active ribosome concentration - active_conc_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y( - "active_ribosome_concentration_mM:Q", title="[Active ribosome] (mM)" - ), + + if "molar_fraction_active" in plot_df.columns: + plots.append( + create_line_chart( + "molar_fraction_active", + "Molar Fraction Active Ribosomes", + "Molar fraction active ribosomes", + skip_first_point=True, + ) ) - .properties(title="Active Ribosome Concentration", width=300, height=150) - ) - plots.append(active_conc_plot) - - # Molar fraction active - molar_fraction_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y("molar_fraction_active:Q", title="Molar fraction active ribosomes"), + + if "mass_fraction_active" in 
plot_df.columns: + plots.append( + create_line_chart( + "mass_fraction_active", + "Mass Fraction Active Ribosomes", + "Mass fraction active ribosomes", + skip_first_point=True, + ) ) - .properties(title="Molar Fraction Active Ribosomes", width=300, height=150) - ) - plots.append(molar_fraction_plot) - - # Mass fraction active - mass_fraction_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y("mass_fraction_active:Q", title="Mass fraction active ribosomes"), + + if "listeners__ribosome_data__did_initialize" in plot_df.columns: + plots.append( + create_line_chart( + "listeners__ribosome_data__did_initialize", + "Ribosome Activations", + "Activations per timestep", + ) ) - .properties(title="Mass Fraction Active Ribosomes", width=300, height=150) - ) - plots.append(mass_fraction_plot) - - # Activations per timestep - activations_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y( - "listeners__ribosome_data__did_initialize:Q", - title="Activations per timestep", - ), + + if "listeners__ribosome_data__did_terminate" in plot_df.columns: + plots.append( + create_line_chart( + "listeners__ribosome_data__did_terminate", + "Ribosome Deactivations", + "Deactivations per timestep", + ) ) - .properties(title="Activations per Timestep", width=300, height=150) - ) - plots.append(activations_plot) - - # Deactivations per timestep - deactivations_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y( - "listeners__ribosome_data__did_terminate:Q", - title="Deactivations per timestep", - ), + + if "activations_per_volume" in plot_df.columns: + plots.append( + create_line_chart( + "activations_per_volume", + "Activations per Volume (fL)", + "Activations per Volume (fL)", + ) ) - .properties(title="Deactivations per Timestep", width=300, height=150) - ) - 
plots.append(deactivations_plot) - - # Activations per time*volume - activations_tv_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y( - "activations_per_time_volume:Q", title="Activations per time*volume" - ), + + if "deactivations_per_volume" in plot_df.columns: + plots.append( + create_line_chart( + "deactivations_per_volume", + "Deactivations per Volume (fL)", + "Deactivations per Volume (fL)", + ) ) - .properties(title="Activations per Time*Volume", width=300, height=150) - ) - plots.append(activations_tv_plot) - - # Deactivations per time*volume - deactivations_tv_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y( - "deactivations_per_time_volume:Q", title="Deactivations per time*volume" - ), + + if "listeners__ribosome_data__actual_elongations" in plot_df.columns: + plots.append( + create_line_chart( + "listeners__ribosome_data__actual_elongations", + "Amino Acids Translated", + "AA translated", + ) ) - .properties(title="Deactivations per Time*Volume", width=300, height=150) - ) - plots.append(deactivations_tv_plot) - - # AA translated - aa_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y( - "listeners__ribosome_data__actual_elongations:Q", title="AA translated" - ), + + if "listeners__ribosome_data__effective_elongation_rate" in plot_df.columns: + plots.append( + create_line_chart( + "listeners__ribosome_data__effective_elongation_rate", + "Effective Ribosome Elongation Rate", + "Effective elongation rate", + ) ) - .properties(title="Amino Acids Translated", width=300, height=150) - ) - plots.append(aa_plot) - - # Effective elongation rate - elongation_plot = ( - alt.Chart(df.to_pandas()) - .mark_line() - .encode( - x=alt.X("Time (min):Q", title="Time (min)"), - y=alt.Y( - "listeners__ribosome_data__effective_elongation_rate:Q", - title="Effective 
elongation rate", - ), + + if not plots: + fallback_df = pl.DataFrame( + { + "message": ["No data available for ribosome usage visualization"], + "x": [0], + "y": [0], + } + ) + fallback_plot = ( + alt.Chart(fallback_df.to_pandas()) + .mark_text(size=20, color="red") + .encode(x="x:Q", y="y:Q", text="message:N") + .properties( + width=600, + height=400, + title="Ribosome Usage Statistics - No Data Available", + ) + ) + plots.append(fallback_plot) + + # Arrange plots in 2 columns as in original + left_plots = plots[::2] # Even indices (0, 2, 4, ...) + right_plots = plots[1::2] # Odd indices (1, 3, 5, ...) + + # Ensure both columns have same length by adding empty chart if needed + if len(left_plots) > len(right_plots): + empty_chart = ( + alt.Chart(pl.DataFrame({"x": [0], "y": [0]}).to_pandas()) + .mark_point(opacity=0) + .encode(x="x:Q", y="y:Q") + .properties(width=600, height=120) + ) + right_plots.append(empty_chart) + elif len(right_plots) > len(left_plots): + empty_chart = ( + alt.Chart(pl.DataFrame({"x": [0], "y": [0]}).to_pandas()) + .mark_point(opacity=0) + .encode(x="x:Q", y="y:Q") + .properties(width=600, height=120) ) - .properties(title="Effective Elongation Rate", width=300, height=150) + left_plots.append(empty_chart) + + # Create two column layout + left_column = alt.vconcat(*left_plots) + right_column = alt.vconcat(*right_plots) + combined_plot = ( + alt.hconcat(left_column, right_column) + .resolve_scale(x="shared", y="independent") + .properties(title="Ribosome Usage Statistics") ) - plots.append(elongation_plot) - # Combine all plots in a grid layout (7 rows, 2 columns) - left_column = alt.vconcat(*plots[:7]) - right_column = alt.vconcat(*plots[7:]) - combined_plot = alt.hconcat(left_column, right_column) + output_path = os.path.join(outdir, "ribosome_usage_report.html") + combined_plot.save(output_path) + print(f"Saved visualization to: {output_path}") - # Save the plot - combined_plot.save(os.path.join(outdir, "ribosome_usage.html")) + return 
combined_plot diff --git a/ecoli/analysis/multigeneration/ribosome_components.py b/ecoli/analysis/multigeneration/ribosome_components.py index ee93d02b4..420106d00 100644 --- a/ecoli/analysis/multigeneration/ribosome_components.py +++ b/ecoli/analysis/multigeneration/ribosome_components.py @@ -1,3 +1,7 @@ +""" +Record the 30S and 50S component count vs time +""" + import altair as alt import os from typing import Any, Dict @@ -13,6 +17,8 @@ read_stacked_columns, ) +# ----------------------------------------- # + def plot( params: Dict[str, Any], @@ -115,7 +121,8 @@ def plot( s50_total=s50_complex + active_ribo, ) - # Prepare data for plotting by melting into long format + # ----------------------------------------- # + plot_cols_30 = ["s30_limiting", "s30_16s_total", "s30_total"] plot_cols_50 = ["s50_limiting", "s50_23s_total", "s50_5s_total", "s50_total"] @@ -126,7 +133,6 @@ def plot( id_vars="Time_min", variable_name="component", value_name="count" ) - # Create 30S components chart with legend chart_30 = ( alt.Chart(melt_30) .mark_line() @@ -138,7 +144,6 @@ def plot( .properties(title="30S Component Counts", width=600) ) - # Create 50S components chart with legend chart_50 = ( alt.Chart(melt_50) .mark_line() @@ -150,7 +155,6 @@ def plot( .properties(title="50S Component Counts", width=600) ) - # Combine and save charts combined = ( alt.vconcat(chart_30, chart_50) .resolve_scale(color="independent") diff --git a/ecoli/analysis/multigeneration/ribosome_crowding.py b/ecoli/analysis/multigeneration/ribosome_crowding.py index 08d88ec8b..559ae95c8 100644 --- a/ecoli/analysis/multigeneration/ribosome_crowding.py +++ b/ecoli/analysis/multigeneration/ribosome_crowding.py @@ -1,21 +1,26 @@ -import os -import pickle +""" +Record the translation probability comparison on Gene EG10184 +""" +import altair as alt +import os from typing import Any -import altair as alt +from duckdb import DuckDBPyConnection +import pickle import polars as pl import numpy as np -from duckdb 
import DuckDBPyConnection from ecoli.library.parquet_emitter import ( - open_arbitrary_sim_data, field_metadata, + open_arbitrary_sim_data, named_idx, read_stacked_columns, ) -# Maximum number of overcrowded proteins to plot +# ----------------------------------------- # + +# Set this to ensure maximum figure size is not exceeded MAX_NUMBER_OF_MONOMERS_TO_PLOT = 300 @@ -32,224 +37,283 @@ def plot( variant_names: dict[str, str], ): """ - Compare target vs actual translation probabilities for mRNAs - whose translation probabilities were limited by ribosome crowding. + Comparison of target translation probabilities vs actual translation + probabilities for mRNAs whose translation probabilities exceeded the limit + set by the physical size and the elongation rates of ribosomes. """ - # 1. Load sim_data from arbitrary source (as in new_gene example) + # Load sim_data to get monomer information with open_arbitrary_sim_data(sim_data_dict) as f: sim_data = pickle.load(f) - # 2. From sim_data, get monomer IDs and mappings to mRNA/gene - mRNA_sim_array = sim_data.process.transcription.cistron_data.struct_array - monomer_sim_array = sim_data.process.translation.monomer_data.struct_array - monomer_ids: list[str] = monomer_sim_array["id"].tolist() + # Get monomer IDs and mappings + mRNA_sim_data = sim_data.process.transcription.cistron_data.struct_array + monomer_sim_data = sim_data.process.translation.monomer_data.struct_array + monomer_ids = monomer_sim_data["id"].tolist() - # Build mapping: monomer_id -> mRNA_id -> gene_id - monomer_to_mRNA: dict[str, str] = dict( - zip(monomer_sim_array["id"], monomer_sim_array["cistron_id"]) - ) - mRNA_to_gene: dict[str, str] = dict( - zip(mRNA_sim_array["id"], mRNA_sim_array["gene_id"]) + # Build mappings: monomer_id -> mRNA_id -> gene_id + monomer_to_mRNA_id_dict = dict( + zip(monomer_sim_data["id"], monomer_sim_data["cistron_id"]) ) + mRNA_to_gene_id_dict = dict(zip(mRNA_sim_data["id"], mRNA_sim_data["gene_id"])) - # 3. 
Determine listener names / field names in vEcoli for target & actual probabilities. - # You need to replace the placeholders below with the real listener/field names - # in your DuckDB schema / parquet emitter config. - # Common pattern: listener name might be "ribosome_data" and fields like - # "target_prob_translation_per_transcript" and "actual_prob_translation_per_transcript". - # - # For example, if in config_sql you have a listener named "ribosome_data" and - # parquet columns named "target_prob_translation_per_transcript_", - # then you may use field_metadata to get the list of column names and named_idx to select indices. - # - # 3.a. Fetch all column names for target probabilities - # TODO: replace "listeners__ribosome_data__target_prob_translation_per_transcript" - # with the actual listener name used in your vEcoli config. + # Get field metadata for ribosome data try: - target_columns: list[str] = field_metadata( + target_field_names = field_metadata( conn, config_sql, "listeners__ribosome_data__target_prob_translation_per_transcript", ) - actual_columns: list[str] = field_metadata( + actual_field_names = field_metadata( conn, config_sql, "listeners__ribosome_data__actual_prob_translation_per_transcript", ) - except Exception: - # If the above naming is incorrect, adjust to your listener naming convention. - raise RuntimeError( - "Failed to fetch field metadata for ribosome data. " - "Please replace the listener names in field_metadata(...) with your actual names." - ) + except Exception as e: + print(f"Error getting field metadata: {e}") + print("Trying alternative listener names...") + try: + target_field_names = field_metadata( + conn, config_sql, "listeners__ribosome_data" + ) + actual_field_names = target_field_names # Assume same structure + except Exception as e2: + print(f"Alternative approach also failed: {e2}") + return - # 3.b. 
Build index dicts: column name -> index in the wide array - # We assume that field_metadata returns a list of column names in the same order - # as the underlying array dimension for translation probabilities. - target_idx_dict = {col: i for i, col in enumerate(target_columns)} - actual_idx_dict = {col: i for i, col in enumerate(actual_columns)} - - # 3.c. Determine indexes for all monomer_ids in these columns - # Here we assume that column names in target_columns/actual_columns directly match monomer_ids. - # If not, adjust the mapping logic accordingly. - target_indexes: list[int] = [] - actual_indexes: list[int] = [] - missing_target = [] - missing_actual = [] - for mon in monomer_ids: - if mon in target_idx_dict: - target_indexes.append(target_idx_dict[mon]) - else: - missing_target.append(mon) - if mon in actual_idx_dict: - actual_indexes.append(actual_idx_dict[mon]) - else: - missing_actual.append(mon) - if missing_target or missing_actual: - # Warn user that some monomers are not present in the listener fields - print( - f"Warning: some monomer IDs not found in target/actual fields. " - f"Missing in target: {missing_target[:5]}{'...' if len(missing_target) > 5 else ''}; " - f"Missing in actual: {missing_actual[:5]}{'...' if len(missing_actual) > 5 else ''}." 
- ) - # Continue with intersection of available monomers - # Use only those present in both - # Find intersection in order of monomer_ids: - valid_monomer_ids = [ - mon for mon in monomer_ids if mon in target_idx_dict and mon in actual_idx_dict - ] - valid_target_indexes = [target_idx_dict[mon] for mon in valid_monomer_ids] - valid_actual_indexes = [actual_idx_dict[mon] for mon in valid_monomer_ids] + # Find indices for each monomer in the field metadata + target_monomer_indices = [] + actual_monomer_indices = [] + valid_monomer_ids = [] + + for i, monomer_id in enumerate(monomer_ids): + if monomer_id in target_field_names and monomer_id in actual_field_names: + target_idx = target_field_names.index(monomer_id) + actual_idx = actual_field_names.index(monomer_id) + target_monomer_indices.append(target_idx) + actual_monomer_indices.append(actual_idx) + valid_monomer_ids.append(monomer_id) if not valid_monomer_ids: - print( - "No overlapping monomer IDs found in ribosome_data listeners; aborting plot." - ) + print("No valid monomer IDs found in ribosome data fields.") return - # 4. Read stacked columns: time + target + actual arrays. - # First read target data: + print(f"[INFO] Found {len(valid_monomer_ids)} valid monomer IDs") + + # Create named indices for data reading target_named = named_idx( - "listenersribosome_data_target_prob_translation_per_transcript", + "listeners__ribosome_data__target_prob_translation_per_transcript", valid_monomer_ids, - valid_target_indexes, + [target_monomer_indices], ) - # Then read actual data: actual_named = named_idx( - "listenersribosome_data_actual_prob_translation_per_transcript", + "listeners__ribosome_data__actual_prob_translation_per_transcript", valid_monomer_ids, - valid_actual_indexes, + [actual_monomer_indices], ) - # Note: 上面 named_idx 的第一个参数需要替换为你项目中实际的 listener 名称前缀,例如 - # "listeners__ribosome_data__target_prob_translation_per_transcript" 或类似,确保与 field_metadata(...) 中使用的 listener 匹配。 - - # Read time + these columns. 
read_stacked_columns 返回 dict-like,包含 "time" 字段和各 monomer 列。 - # 如果 time 字段命名不是 "time",请调整。 - target_data = read_stacked_columns(history_sql, [target_named], conn=conn) - actual_data = read_stacked_columns(history_sql, [actual_named], conn=conn) - - # Convert to Polars DataFrame - df_target = pl.DataFrame(target_data) - df_actual = pl.DataFrame(actual_data) - - # Assume both have a "time" column; drop duplicate time in actual - if "time" in df_actual.columns: - df_actual = df_actual.drop("time") - - # 5. Rename columns to distinguish target vs actual - # e.g., columns are monomer IDs; 重命名为 target_ / actual_ - rename_target = {mon: f"target_{mon}" for mon in valid_monomer_ids} - rename_actual = {mon: f"actual_{mon}" for mon in valid_monomer_ids} - df_target = df_target.rename(rename_target) - df_actual = df_actual.rename(rename_actual) - - # Merge horizontally on row order (time) - df = pl.concat([df_target, df_actual], how="horizontal") - # Create Time (min) column - if "time" in df.columns: - df = df.with_columns((pl.col("time") / 60).alias("Time (min)")) - else: - raise RuntimeError("No 'time' column found in ribosome data readout.") - - # Compute overcrowded monomer indices: where max(target - actual) > 0 - # We'll convert to NumPy for efficient max along time axis. 
- # Build numpy arrays in matching order - # n = len(valid_monomer_ids) - # Stack arrays: shape (T, n) - target_matrix = np.vstack( - [df[f"target_{mon}"].to_numpy() for mon in valid_monomer_ids] - ).T - actual_matrix = np.vstack( - [df[f"actual_{mon}"].to_numpy() for mon in valid_monomer_ids] - ).T - diff = target_matrix - actual_matrix - # max over time for each monomer - max_diff = diff.max(axis=0) - overcrowded_mask = max_diff > 0 - overcrowded_indices = np.where(overcrowded_mask)[0].tolist() - n_overcrowded = len(overcrowded_indices) - - if n_overcrowded == 0: - print("No overcrowded mRNAs detected in this simulation; nothing to plot.") - return - # Limit number to plot - n_to_plot = min(n_overcrowded, MAX_NUMBER_OF_MONOMERS_TO_PLOT) - if n_overcrowded > MAX_NUMBER_OF_MONOMERS_TO_PLOT: + # Read target and actual data separately to ensure proper structure + try: + # Read target data + target_data = read_stacked_columns( + history_sql, + [target_named], + conn=conn, + ) + target_df = pl.DataFrame(target_data).with_columns( + **{"Time (min)": pl.col("time") / 60} + ) + + # Read actual data + actual_data = read_stacked_columns( + history_sql, + [actual_named], + conn=conn, + ) + actual_df = pl.DataFrame(actual_data).with_columns( + **{"Time (min)": pl.col("time") / 60} + ) + + # Get the probability columns + target_prob_cols = [ + col for col in target_df.columns if col in valid_monomer_ids + ] + actual_prob_cols = [ + col for col in actual_df.columns if col in valid_monomer_ids + ] + + if not target_prob_cols or not actual_prob_cols: + print("Could not find probability columns in datasets") + return + + # Create arrays for calculation + target_prob_array = target_df.select(target_prob_cols).to_numpy() + actual_prob_array = actual_df.select(actual_prob_cols).to_numpy() + time_min = target_df["Time (min)"].to_numpy() + + print("[INFO] Successfully read target and actual data") print( - f"Total overcrowded proteins: {n_overcrowded}. 
" - f"Plotting first {MAX_NUMBER_OF_MONOMERS_TO_PLOT} only." + f"[INFO] Target shape: {target_prob_array.shape}, Actual shape: {actual_prob_array.shape}" ) - # For each overcrowded monomer, get gene ID and build a small Altair chart. - charts = [] - for idx_in_list in overcrowded_indices[:n_to_plot]: - monomer = valid_monomer_ids[idx_in_list] - gene = mRNA_to_gene.get(monomer_to_mRNA.get(monomer, ""), "unknown") - # Build a pandas DataFrame for plotting: columns Time (min), target, actual - # Use Polars to pandas conversion for this single monomer: - pd_df = ( - df.select(["Time (min)", f"target_{monomer}", f"actual_{monomer}"]) - .rename( + except Exception as e: + print(f"Failed to read separate datasets: {e}") + return + + # Calculate differences to find overcrowded mRNAs + prob_differences = target_prob_array - actual_prob_array + overcrowded_monomer_indexes = np.where(prob_differences.max(axis=0) > 0)[0] + n_overcrowded_monomers = len(overcrowded_monomer_indexes) + + print(f"[INFO] Found {n_overcrowded_monomers} overcrowded monomers") + + if n_overcrowded_monomers == 0: + print("No overcrowded mRNAs found in the simulation.") + return + + # Get gene IDs for overcrowded monomers + overcrowded_monomer_ids = [ + valid_monomer_ids[i] for i in overcrowded_monomer_indexes + ] + overcrowded_gene_ids = [ + mRNA_to_gene_id_dict.get(monomer_to_mRNA_id_dict.get(monomer_id), "unknown") + for monomer_id in overcrowded_monomer_ids + ] + + n_overcrowded_monomers_to_plot = min( + n_overcrowded_monomers, MAX_NUMBER_OF_MONOMERS_TO_PLOT + ) + + # ----------------------------------------- # + + plot_data = [] + for i, monomer_index in enumerate(overcrowded_monomer_indexes): + if i >= MAX_NUMBER_OF_MONOMERS_TO_PLOT: + break + + gene_id = overcrowded_gene_ids[i] + + # Get the data for this monomer + target_probs = target_prob_array[:, monomer_index] + actual_probs = actual_prob_array[:, monomer_index] + + # Add target probabilities + for j, time_val in enumerate(time_min): + 
plot_data.append( { - "Time (min)": "Time (min)", - f"target_{monomer}": "target", - f"actual_{monomer}": "actual", + "Time_min": float(time_val), + "Gene_ID": str(gene_id), + "Probability_Type": "target", + "Translation_Probability": float(target_probs[j]), + "Plot_Order": i, } ) - .to_pandas() - ) - # Melt to long form - pd_long = pd_df.melt( - id_vars=["Time (min)"], - value_vars=["target", "actual"], - var_name="Type", - value_name="Probability", - ) - # Create line chart + + # Add actual probabilities + for j, time_val in enumerate(time_min): + plot_data.append( + { + "Time_min": float(time_val), + "Gene_ID": str(gene_id), + "Probability_Type": "actual", + "Translation_Probability": float(actual_probs[j]), + "Plot_Order": i, + } + ) + + if not plot_data: + print("No data prepared for plotting") + return + + plot_df = pl.DataFrame(plot_data) + + # Create individual plots for each overcrowded gene + charts = [] + for i in range(n_overcrowded_monomers_to_plot): + gene_data = plot_df.filter(pl.col("Plot_Order") == i) + + if gene_data.height == 0: + continue + + gene_id = gene_data["Gene_ID"][0] + + # Create chart with simplified encoding and proper tooltip chart = ( - alt.Chart(pd_long) - .mark_line() + alt.Chart(gene_data) + .mark_line(point=False, strokeWidth=2) .encode( - x=alt.X("Time (min)", title="Time (min)"), - y=alt.Y("Probability", title="Translation Probability"), - color="Type", + x=alt.X("Time_min:Q", title="Time (min)", scale=alt.Scale(nice=True)), + y=alt.Y( + "Translation_Probability:Q", + title=f"{gene_id} translation probability", + scale=alt.Scale(nice=True), + ), + color=alt.Color( + "Probability_Type:N", + scale=alt.Scale( + domain=["target", "actual"], range=["#1f77b4", "#ff7f0e"] + ), + legend=alt.Legend(title="Type") if i == 0 else None, + ), + strokeDash=alt.StrokeDash( + "Probability_Type:N", + scale=alt.Scale( + domain=["target", "actual"], range=[[1, 0], [5, 5]] + ), + ), + tooltip=[ + alt.Tooltip("Time_min:Q", title="Time (min)", 
format=".2f"), + alt.Tooltip( + "Translation_Probability:Q", title="Probability", format=".4f" + ), + alt.Tooltip("Probability_Type:N", title="Type"), + alt.Tooltip("Gene_ID:N", title="Gene"), + ], + ) + .properties( + width=600, + height=150, + title=alt.TitleParams( + text=[ + f"Gene {gene_id} - Translation Probability Comparison", + f"Total overcrowded proteins: {n_overcrowded_monomers}" + + ( + f" (showing first {MAX_NUMBER_OF_MONOMERS_TO_PLOT})" + if n_overcrowded_monomers > MAX_NUMBER_OF_MONOMERS_TO_PLOT + else "" + ) + if i == 0 + else "", + ], + fontSize=12, + anchor="start", + ), ) - .properties(title=f"{gene} (monomer {monomer})") - .interactive() ) + charts.append(chart) - # Vertically concatenate all charts - combined = ( - alt.vconcat(*charts) - .configure_axis(labelFontSize=10, titleFontSize=12) - .configure_title(fontSize=14) - ) + if charts: + combined_chart = ( + alt.vconcat(*charts) + .resolve_scale(color="independent") + .add_params(alt.selection_interval(bind="scales")) + ) + + alt.data_transformers.enable("json") - # Save to HTML - os.makedirs(outdir, exist_ok=True) - outpath = os.path.join(outdir, "ribosome_crowding.html") - combined.save(outpath) - print(f"Saved ribosome crowding plot to {outpath}") + output_path = os.path.join(outdir, "ribosome_crowding.html") + combined_chart.save(output_path) + + print( + f"[INFO] Generated ribosome crowding plot for {len(charts)} overcrowded proteins" + ) + print(f"[INFO] Plot saved to: {output_path}") + + # Also save as JSON for debugging if needed + json_path = os.path.join(outdir, "ribosome_crowding.json") + combined_chart.save(json_path) + print(f"[INFO] Chart specification saved to: {json_path}") + + else: + print("[INFO] No charts created - no data to plot") From 1e32d84f72969990aa1e17635f0d2c199260d28e Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 16:04:34 -0700 Subject: [PATCH 17/71] Explicitly finalize Parquet emitter instead of relying on atexit hook Exceptions in atexit hooks
are ignored and not reflected in the final exit code. Additionally, new futures cannot be scheduled in atexit hooks (after interpreter shutdown) so asyncio does not work. --- ecoli/experiments/ecoli_master_sim.py | 92 ++++++++++----------- ecoli/library/parquet_emitter.py | 110 ++++++++++++-------------- ecoli/library/test_parquet_emitter.py | 15 ++-- ecoli/processes/engine_process.py | 14 +++- 4 files changed, 105 insertions(+), 126 deletions(-) diff --git a/ecoli/experiments/ecoli_master_sim.py b/ecoli/experiments/ecoli_master_sim.py index 50931150c..3a268d16a 100644 --- a/ecoli/experiments/ecoli_master_sim.py +++ b/ecoli/experiments/ecoli_master_sim.py @@ -732,7 +732,43 @@ def build_ecoli(self): self.generated_initial_state, initial_environment ) - def save_states(self, daughter_outdir: str = ""): + def update_experiment(self, time_to_update: float = 0.0): + """ + Runs the E. coli simulation for a specified amount of time. If the + simulation reaches a division event and ``config['generations']`` is set, + it will save the daughter cell states to JSON files in the directory + specified by ``config['daughter_outdir']``. If the simulation reaches + the maximum duration specified by ``config['max_duration']``, it will + raise a :py:class:`~ecoli.experiments.ecoli_master_sim.TimeLimitError` + if ``config['fail_at_max_duration']`` is ``True``. + """ + try: + self.ecoli_experiment.update(time_to_update) + except DivisionDetected: + state = self.ecoli_experiment.state.get_value(condition=not_a_process) + assert len(state["agents"]) == 2 + for i, agent_state in enumerate(state["agents"].values()): + prepare_save_state(agent_state) + daughter_path = os.path.join( + self.daughter_outdir, f"daughter_state_{i}.json" + ) + write_json(daughter_path, agent_state) + print( + f"Divided at t = {self.ecoli_experiment.global_time} after" + f"{self.ecoli_experiment.global_time - self.initial_global_time} sec." 
+ ) + with open("division_time.sh", "w") as f: + f.write(f"export division_time={self.ecoli_experiment.global_time}") + # Tell Parquet emitter that simulation was successful + if isinstance(self.ecoli_experiment.emitter, ParquetEmitter): + self.ecoli_experiment.emitter.success = True + self.ecoli_experiment.emitter.finalize() + sys.exit() + finally: + if isinstance(self.ecoli_experiment.emitter, ParquetEmitter): + self.ecoli_experiment.emitter.finalize() + + def save_states(self): """ Runs the simulation while saving the states of specific timesteps to files named ``data/vivecoli_t{time}.json``. Invoked by @@ -740,12 +776,6 @@ def save_states(self, daughter_outdir: str = ""): if ``config['save'] == True``. State is saved as a JSON that can be reloaded into a simulation as described in :py:meth:`~ecoli.composites.ecoli_master.Ecoli.initial_state`. - - Args: - daughter_outdir: Location to write JSON files for daughter cell(s). - Only used if ``config`` contains ``generations`` key specifying - number of generations to simulate. Nextflow chains simulations - together by passing saved daughter states to new processes. """ for time in self.save_times: if time > self.max_duration: @@ -759,27 +789,7 @@ def save_states(self, daughter_outdir: str = ""): time_to_next_save = self.save_times[i] else: time_to_next_save = self.save_times[i] - self.save_times[i - 1] - try: - self.ecoli_experiment.update(time_to_next_save) - except DivisionDetected: - state = self.ecoli_experiment.state.get_value(condition=not_a_process) - assert len(state["agents"]) == 2 - for i, agent_state in enumerate(state["agents"].values()): - prepare_save_state(agent_state) - daughter_path = os.path.join( - daughter_outdir, f"daughter_state_{i}.json" - ) - write_json(daughter_path, agent_state) - print( - f"Divided at t = {self.ecoli_experiment.global_time} after" - f"{self.ecoli_experiment.global_time - self.initial_global_time} sec." 
- ) - with open("division_time.sh", "w") as f: - f.write(f"export division_time={self.ecoli_experiment.global_time}") - # Tell Parquet emitter that simulation was successful - if isinstance(self.ecoli_experiment.emitter, ParquetEmitter): - self.ecoli_experiment.emitter.success = True - sys.exit() + self.update_experiment(time_to_next_save) time_elapsed = self.save_times[i] state = self.ecoli_experiment.state.get_value(condition=not_a_process) if self.divide: @@ -791,7 +801,7 @@ def save_states(self, daughter_outdir: str = ""): print("Finished saving the state at t = " + str(time_elapsed)) time_remaining = self.max_duration - self.save_times[-1] if time_remaining: - self.ecoli_experiment.update(time_remaining) + self.update_experiment(time_remaining) def run(self): """Create and run an EcoliSim experiment. @@ -890,29 +900,9 @@ def run(self): # run the experiment if self.save: - self.save_states(self.daughter_outdir) + self.save_states() else: - try: - self.ecoli_experiment.update(self.max_duration) - except DivisionDetected: - state = self.ecoli_experiment.state.get_value(condition=not_a_process) - assert len(state["agents"]) == 2 - for i, agent_state in enumerate(state["agents"].values()): - prepare_save_state(agent_state) - daughter_path = os.path.join( - self.daughter_outdir, f"daughter_state_{i}.json" - ) - write_json(daughter_path, agent_state) - print( - f"Divided at t = {self.ecoli_experiment.global_time} after" - f"{self.ecoli_experiment.global_time - self.initial_global_time} sec." 
- ) - with open("division_time.sh", "w") as f: - f.write(f"export division_time={self.ecoli_experiment.global_time}") - # Tell Parquet emitter that simulation was successful - if isinstance(self.ecoli_experiment.emitter, ParquetEmitter): - self.ecoli_experiment.emitter.success = True - sys.exit() + self.update_experiment(self.max_duration) self.ecoli_experiment.end() if self.profile: report_profiling(self.ecoli_experiment.stats) diff --git a/ecoli/library/parquet_emitter.py b/ecoli/library/parquet_emitter.py index 74b93e663..3c1083007 100644 --- a/ecoli/library/parquet_emitter.py +++ b/ecoli/library/parquet_emitter.py @@ -1,6 +1,5 @@ import atexit import os -import sys from concurrent.futures import Future, ThreadPoolExecutor from typing import Any, Callable, cast, Mapping, Optional from urllib import parse @@ -8,9 +7,9 @@ import duckdb import numpy as np import polars as pl -from pyarrow import fs from polars.datatypes import DataTypeClass from fsspec.core import filesystem, url_to_fs, OpenFile +from fsspec.spec import AbstractFileSystem from tqdm import tqdm from vivarium.core.emitter import Emitter @@ -66,7 +65,7 @@ def json_to_parquet( emit_dict: dict[str, np.ndarray | list[pl.Series]], outfile: str, schema: dict[str, Any], - filesystem: fs.FileSystem, + filesystem: AbstractFileSystem, ): """Convert dictionary to Parquet. @@ -75,7 +74,7 @@ def json_to_parquet( or lists of Polars Series (variable-shape). outfile: Path to output Parquet file. Can be local path or URI. schema: Full mapping of column names to Polars dtypes. - filesystem: On local filesystem, PyArrow filesystem needed to + filesystem: On local filesystem, fsspec filesystem needed to write Parquet file atomically. """ tbl = pl.DataFrame(emit_dict, schema={k: schema[k] for k in emit_dict}) @@ -83,13 +82,11 @@ def json_to_parquet( # trying to read partially written Parquet files. Get around this by writing # to a temporary file and then renaming it to the final output file. 
temp_outfile = outfile - parsed_outfile = parse.urlparse(outfile) - if parsed_outfile.scheme in ("", "file", "local"): - outfile = os.path.join(parsed_outfile.netloc, parsed_outfile.path) + if parse.urlparse(outfile).scheme in ("", "file", "local"): temp_outfile = outfile + ".tmp" tbl.write_parquet(temp_outfile, statistics=False) if temp_outfile != outfile: - filesystem.move(temp_outfile, outfile) + filesystem.mv(temp_outfile, outfile) def union_by_name(query_sql: str) -> str: @@ -813,7 +810,12 @@ def submit(self, fn: Callable, *args, **kwargs) -> Future: class ParquetEmitter(Emitter): """ - Emit data to a Parquet dataset. + Emit data to a Parquet dataset. Note that :py:meth:`~.finalize` + must be explicitly called in a ``try...finally`` block around the call to + :py:meth:`vivarium.core.engine.Engine.update` to ensure that all buffered + emits are written to Parquet files when the simulation ends for any reason. + This is handled automatically in :py:class:`~ecoli.experiments.ecoli_master_sim.EcoliSim` + and :py:class:`~ecoli.processes.engine_process.EngineProcess` """ def __init__(self, config: dict[str, Any]) -> None: @@ -838,7 +840,8 @@ def __init__(self, config: dict[str, Any]) -> None: self.out_uri = os.path.abspath(config["out_dir"]) else: self.out_uri = config["out_uri"] - self.filesystem, self.out_dir = fs.FileSystem.from_uri(self.out_uri) + self.filesystem: AbstractFileSystem + self.filesystem, _ = url_to_fs(self.out_uri) self.batch_size = config.get("batch_size", 400) self.threaded = config.get("threaded", True) if self.threaded: @@ -860,62 +863,48 @@ def __init__(self, config: dict[str, Any]) -> None: self.last_batch_future.set_result(None) # Set either by EcoliSim or by EngineProcess if sim reaches division self.success = False - atexit.register(self._finalize) - def _finalize(self): + def finalize(self): """Convert remaining batched emits to Parquet at sim shutdown and mark sim as successful if ``success`` flag was set. 
In vEcoli, this is done by :py:class:`~ecoli.experiments.ecoli_master_sim.EcoliSim` upon reaching division. """ - try: - # Wait for last batch to finish writing - self.last_batch_future.result() - # Flush any remaining buffered emits to Parquet - outfile = os.path.join( + # Wait for last batch to finish writing + self.last_batch_future.result() + # Flush any remaining buffered emits to Parquet + outfile = os.path.join( + self.out_uri, + self.experiment_id, + "history", + self.partitioning_path, + f"{self.num_emits}.pq", + ) + self.filesystem.makedirs(os.path.dirname(outfile), exist_ok=True) + if not self.filesystem.exists(outfile): + for k, v in self.buffered_emits.items(): + self.buffered_emits[k] = v[: self.num_emits % self.batch_size] + json_to_parquet( + self.buffered_emits, outfile, self.pl_types, self.filesystem + ) + # Hive-partitioned directory that only contains successful sims + if self.success: + success_file = os.path.join( self.out_uri, self.experiment_id, - "history", + "success", self.partitioning_path, - f"{self.num_emits}.pq", + "s.pq", ) - # PyArrow filesystem requires path, not URI - self.filesystem.create_dir( - self.out_dir + os.path.dirname(outfile)[len(self.out_uri) :] + try: + self.filesystem.delete(os.path.dirname(success_file), recursive=True) + except (FileNotFoundError, OSError): + pass + self.filesystem.makedirs(os.path.dirname(success_file)) + pl.DataFrame({"success": [True]}).write_parquet( + success_file, + statistics=False, ) - # Write remaining buffered emits - if self.num_emits % self.batch_size != 0: - for k, v in self.buffered_emits.items(): - self.buffered_emits[k] = v[: self.num_emits % self.batch_size] - json_to_parquet( - self.buffered_emits, outfile, self.pl_types, self.filesystem - ) - # Hive-partitioned directory that only contains successful sims - if self.success: - success_file = os.path.join( - self.out_uri, - self.experiment_id, - "success", - self.partitioning_path, - "s.pq", - ) - success_dir = ( - self.out_dir + 
os.path.dirname(success_file)[len(self.out_uri) :] - ) - try: - self.filesystem.delete_dir(success_dir) - except (FileNotFoundError, OSError): - pass - self.filesystem.create_dir(success_dir) - pl.DataFrame({"success": [True]}).write_parquet( - success_file, - statistics=False, - ) - except Exception as e: - # Since Python ignores exceptions in atexit callbacks, - # we need to explicitly set the exit code - print(f"Error during ParquetEmitter finalization: {e}", file=sys.stderr) - os._exit(1) def emit(self, data: dict[str, Any]): """ @@ -986,14 +975,13 @@ def emit(self, data: dict[str, Any]): self.partitioning_path, "config.pq", ) - config_dir = self.out_dir + os.path.dirname(outfile)[len(self.out_uri) :] # Cleanup any existing output files from previous runs then # create new folder for config / simulation output try: - self.filesystem.delete_dir(config_dir) + self.filesystem.delete(os.path.dirname(outfile), recursive=True) except (FileNotFoundError, OSError): pass - self.filesystem.create_dir(config_dir) + self.filesystem.makedirs(os.path.dirname(outfile)) self.last_batch_future = self.executor.submit( json_to_parquet, config_emit, @@ -1003,13 +991,13 @@ def emit(self, data: dict[str, Any]): ) # Delete any sim output files in final filesystem history_outdir = os.path.join( - self.out_dir, self.experiment_id, "history", self.partitioning_path + self.out_uri, self.experiment_id, "history", self.partitioning_path ) try: - self.filesystem.delete_dir(history_outdir) + self.filesystem.delete(history_outdir, recursive=True) except (FileNotFoundError, OSError): pass - self.filesystem.create_dir(history_outdir) + self.filesystem.makedirs(history_outdir) return # Each Engine that uses this emitter should only simulate a single cell # In lineage simulations, StopAfterDivision Step will terminate diff --git a/ecoli/library/test_parquet_emitter.py b/ecoli/library/test_parquet_emitter.py index ad0debec5..e3d3652df 100644 --- a/ecoli/library/test_parquet_emitter.py +++ 
b/ecoli/library/test_parquet_emitter.py @@ -292,10 +292,6 @@ def test_initialization(self, temp_dir): emitter.partitioning_path = "path/to/output" assert emitter.out_uri == "gs://bucket/path" assert emitter.batch_size == 100 - # GCSFS uses asyncio and cannot schedule futures after interpreter shutdown - # so _finalize hook with raise an error that is ignored. Here we just - # unregister the hook to avoid cluttering the pytest log - atexit.unregister(emitter._finalize) def test_emit_configuration(self, temp_dir): """Test emitting configuration data.""" @@ -707,7 +703,7 @@ def test_extreme_data_types(self, temp_dir): ) def test_finalize(self, temp_dir): - """Test _finalize method that handles remaining data.""" + """Test finalize method that handles remaining data.""" emitter = ParquetEmitter({"out_dir": temp_dir}) emitter.experiment_id = "test_exp" emitter.partitioning_path = "path/to/output" @@ -729,8 +725,8 @@ def test_finalize(self, temp_dir): with patch( "ecoli.library.parquet_emitter.json_to_parquet" ) as mock_json_to_parquet: - # Test _finalize - emitter._finalize() + # Test finalize + emitter.finalize() # Verify json_to_parquet was called with truncated data mock_json_to_parquet.assert_called_once() @@ -741,7 +737,7 @@ def test_finalize(self, temp_dir): # Test success flag emitter.success = True - emitter._finalize() + emitter.finalize() assert os.path.exists( os.path.join( emitter.out_uri, @@ -939,8 +935,7 @@ def delayed_execution(): # Changed type for field2 to list so should fail with pytest.raises(pl.exceptions.InvalidOperationError): - emitter._finalize() - atexit.unregister(emitter._finalize) + emitter.finalize() # Cleanup the real executor real_executor.shutdown() diff --git a/ecoli/processes/engine_process.py b/ecoli/processes/engine_process.py index df45cbe21..8f9ad2418 100644 --- a/ecoli/processes/engine_process.py +++ b/ecoli/processes/engine_process.py @@ -505,9 +505,15 @@ def next_update(self, timestep, states): self.emitter.emit(emit_config) # 
Run inner simulation for timestep. - self.sim.run_for(timestep) - if force_complete: - self.sim.complete() + try: + self.sim.run_for(timestep) + if force_complete: + self.sim.complete() + except Exception: + if isinstance(self.emitter, ParquetEmitter): + self.emitter.success = True + self.emitter.finalize() + raise update = {} @@ -520,7 +526,7 @@ def next_update(self, timestep, states): # Finalize emits before division if isinstance(self.emitter, ParquetEmitter): self.emitter.success = True - self.emitter._finalize() + self.emitter.finalize() # Perform division. daughters = [] daughter_states = self.sim.state.divide_value() From 8125da84f61a17240f50dd258fcd8924b3dcb294 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 16:12:55 -0700 Subject: [PATCH 18/71] Remove unused atexit import --- ecoli/library/parquet_emitter.py | 1 - ecoli/library/test_parquet_emitter.py | 1 - 2 files changed, 2 deletions(-) diff --git a/ecoli/library/parquet_emitter.py b/ecoli/library/parquet_emitter.py index 3c1083007..903fd3132 100644 --- a/ecoli/library/parquet_emitter.py +++ b/ecoli/library/parquet_emitter.py @@ -1,4 +1,3 @@ -import atexit import os from concurrent.futures import Future, ThreadPoolExecutor from typing import Any, Callable, cast, Mapping, Optional diff --git a/ecoli/library/test_parquet_emitter.py b/ecoli/library/test_parquet_emitter.py index e3d3652df..670306748 100644 --- a/ecoli/library/test_parquet_emitter.py +++ b/ecoli/library/test_parquet_emitter.py @@ -1,4 +1,3 @@ -import atexit import os import re import tempfile From cd05466d0b2f315429a935f9aabd26e3cfd44c8b Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Sun, 29 Jun 2025 08:53:11 +0800 Subject: [PATCH 19/71] Add rna_decay_03_high analysis method in multigen --- .../multigeneration/rna_decay_03_high.py | 303 ++++++------------ 1 file changed, 104 insertions(+), 199 deletions(-) diff --git a/ecoli/analysis/multigeneration/rna_decay_03_high.py 
b/ecoli/analysis/multigeneration/rna_decay_03_high.py index 65c52dde1..4a77d0a9e 100644 --- a/ecoli/analysis/multigeneration/rna_decay_03_high.py +++ b/ecoli/analysis/multigeneration/rna_decay_03_high.py @@ -1,21 +1,6 @@ -""" -Plot dynamic traces of genes with high expression (> 20 counts of mRNA) - -EG10367_RNA 24.8 gapA Glyceraldehyde 3-phosphate dehydrogenase -EG11036_RNA 25.2 tufA Elongation factor Tu -EG50002_RNA 26.2 rpmA 50S Ribosomal subunit protein L27 -EG10671_RNA 30.1 ompF Outer membrane protein F -EG50003_RNA 38.7 acpP Apo-[acyl carrier protein] -EG10669_RNA 41.1 ompA Outer membrane protein A -EG10873_RNA 44.7 rplL 50S Ribosomal subunit protein L7/L12 dimer -EG12179_RNA 46.2 cspE Transcription antiterminator and regulator of RNA stability -EG10321_RNA 53.2 fliC Flagellin -EG10544_RNA 97.5 lpp Murein lipoprotein -""" - import altair as alt import os -from typing import Any, cast +from typing import Any import pickle import polars as pl import numpy as np @@ -41,13 +26,15 @@ def plot( variant_metadata: dict[str, dict[int, Any]], variant_names: dict[str, str], ): - # Load sim_data + # Load sim_data for expected cistron order and degradation rates with open_arbitrary_sim_data(sim_data_dict) as f: sim_data = pickle.load(f) + cistron_array = sim_data.process.transcription.cistron_data.struct_array + all_ids = list(cistron_array["id"]) + deg_rates = {row["id"]: row["deg_rate"] for row in cistron_array} - all_cistron_ids = sim_data.process.transcription.cistron_data["id"].tolist() - - cistron_ids = [ + # Define high-expression cistrons + target_ids = [ "EG10367_RNA", "EG11036_RNA", "EG50002_RNA", @@ -59,198 +46,116 @@ def plot( "EG10321_RNA", "EG10544_RNA", ] + valid_ids = [cid for cid in target_ids if cid in all_ids] + if not valid_ids: + print("[ERROR] No matching cistrons in sim_data") + return - names = [ - "gapA - Glyceraldehyde 3-phosphate dehydrogenase", - "tufA - Elongation factor Tu", - "rpmA - 50S Ribosomal subunit protein L27", - "ompF - Outer 
membrane protein F", - "acpP - Apo-[acyl carrier protein]", - "ompA - Outer membrane protein A", - "rplL - 50S Ribosomal subunit protein L7/L12 dimer", - "cspE - Transcription antiterminator and regulator of RNA stability", - "fliC - Flagellin", - "lpp - Murein lipoprotein", - ] + # Retrieve metadata for degradation and counts + deg_field = "listeners__rna_degradation_listener__count_RNA_degraded_per_cistron" + cnt_field = "listeners__rna_counts__mRNA_cistron_counts" + try: + deg_meta = field_metadata(conn, config_sql, deg_field) + cnt_meta = field_metadata(conn, config_sql, cnt_field) + except Exception as e: + print(f"[ERROR] field_metadata failed: {e}") + return - cistron_idxs = [all_cistron_ids.index(x) for x in cistron_ids] - deg_rates = sim_data.process.transcription.cistron_data["deg_rate"][cistron_idxs] + # Find indices for valid cistrons + deg_indices = [deg_meta.index(cid) for cid in valid_ids] + cnt_indices = [cnt_meta.index(cid) for cid in valid_ids] - # Get indexes for the specific cistrons we want to track - rna_degradation_idx_dict = { - cistron: i - for i, cistron in enumerate( - field_metadata( - conn, - config_sql, - "listenersrna_degradation__count_RNA_degraded_per_cistron", - ) - ) - } + # Build named_idx structures + deg_named = named_idx(deg_field, valid_ids, [deg_indices]) + cnt_named = named_idx(cnt_field, valid_ids, [cnt_indices]) - rna_counts_idx_dict = { - cistron: i - for i, cistron in enumerate( - field_metadata(conn, config_sql, "listenersrna_counts__mRNA_cistron_counts") + # Read stacked columns + try: + data_dict = read_stacked_columns( + history_sql, + [deg_named, cnt_named], + conn=conn, ) - } - - cistron_degradation_indexes = [ - cast(int, rna_degradation_idx_dict.get(cistron_id)) - for cistron_id in cistron_ids - ] - - cistron_counts_indexes = [ - cast(int, rna_counts_idx_dict.get(cistron_id)) for cistron_id in cistron_ids - ] + except Exception as e: + print(f"[ERROR] read_stacked_columns failed: {e}") + return - # Load data 
using vEcoli pattern - degradation_columns = named_idx( - "listenersrna_degradation__count_RNA_degraded_per_cistron", - cistron_ids, - cistron_degradation_indexes, + # Convert to Polars DataFrame + df = pl.DataFrame(data_dict) + # Rename time and convert to minutes + if "time" in df.columns: + df = df.with_columns((pl.col("time") / 60).alias("time_min")) + + # Melt degradation and counts + deg_cols = valid_ids + cnt_cols = valid_ids + deg_df = df.select(["time_min"] + deg_cols).melt( + "time_min", variable_name="cistron", value_name="degraded" ) - - counts_columns = named_idx( - "listenersrna_counts__mRNA_cistron_counts", cistron_ids, cistron_counts_indexes + cnt_df = df.select(["time_min"] + cnt_cols).melt( + "time_min", variable_name="cistron", value_name="counts" ) + joined = deg_df.join(cnt_df, on=["time_min", "cistron"]) - # Read data - data = read_stacked_columns( - history_sql, - [degradation_columns, counts_columns, "time", "timeStepSec"], - conn=conn, - ) - - df = pl.DataFrame(data) - - # Convert to numpy arrays for processing (similar to original logic) - N = 100 # smoothing window - - # Group by simulation and process each separately - processed_data = [] - - for sim_data_group in df.group_by(["variant", "seed", "generation"]): - sim_df = sim_data_group[1].sort("time") - - # Extract arrays for this simulation - dt = sim_df["timeStepSec"].to_numpy() - - # Process degradation counts - degraded_counts = np.column_stack( - [ - sim_df[ - f"listenersrna_degradation__count_RNA_degraded_per_cistron__{cistron_id}" - ].to_numpy() - for cistron_id in cistron_ids - ] - ) - - # Process RNA counts - rna_counts = np.column_stack( - [ - sim_df[ - f"listenersrna_counts__mRNA_cistron_counts__{cistron_id}" - ].to_numpy() - for cistron_id in cistron_ids - ] - ) - - # Apply smoothing (similar to original) - if len(dt) > 2 * N: - degraded_smoothed = np.nan * np.ones_like(degraded_counts) - counts_smoothed = np.nan * np.ones_like(rna_counts) - - for col_idx in 
range(degraded_counts.shape[1]): - # Smooth degradation rates - degraded_smoothed[:, col_idx] = np.convolve( - degraded_counts[:, col_idx] / dt, np.ones(N) / N, mode="same" - ) - # Smooth counts - counts_smoothed[:, col_idx] = np.convolve( - rna_counts[:, col_idx], np.ones(N) / N, mode="same" - ) - - # Trim edges - degraded_trimmed = degraded_smoothed[N:-N, :] - counts_trimmed = counts_smoothed[N:-N, :] - - processed_data.append( - { - "degraded": degraded_trimmed, - "counts": counts_trimmed, - "variant": sim_data_group[1]["variant"].iloc[0], - "seed": sim_data_group[1]["seed"].iloc[0], - "generation": sim_data_group[1]["generation"].iloc[0], - } - ) - - if not processed_data: - print("No data available for processing") - return - - # Combine all processed data - all_degraded = np.vstack([d["degraded"] for d in processed_data]) - all_counts = np.vstack([d["counts"] for d in processed_data]) - - # Create subplot charts using Altair + # Smooth and fit per cistron charts = [] - - for subplot_idx in range( - min(9, len(cistron_ids)) - ): # Limit to 9 subplots like original - if subplot_idx >= len(cistron_ids): - break - - y = all_degraded[:, subplot_idx] - A = all_counts[:, subplot_idx] - - try: - # Calculate degradation rate using least squares - kdeg, _, _, _ = np.linalg.lstsq(A[:, np.newaxis], y, rcond=None) - kdeg = kdeg[0] - except (ValueError, np.linalg.LinAlgError): - print(f"Skipping subplot {subplot_idx} because not enough data") + window = 100 + for cid in valid_ids[:9]: # up to 9 plots + sub = joined.filter(pl.col("cistron") == cid).sort("time_min") + if sub.height < 2 * window: continue - - # Subsample data for plotting (similar to original ::N) - plot_data = pl.DataFrame({"RNA_counts": A[::N], "RNA_degraded": y[::N]}) - - chart = ( - alt.Chart(plot_data) - .mark_circle() - .encode( - x=alt.X("RNA_counts:Q", title="RNA (counts)"), - y=alt.Y("RNA_degraded:Q", title="RNA degraded (counts)"), - ) - .properties( - title=f"{names[subplot_idx].split(' - ')[0]}\n" 
- f"kdeg meas: {kdeg:.1e}\n" - f"kdeg exp: {deg_rates[subplot_idx]:.1e}", - width=250, - height=200, - ) + counts = sub["counts"].to_numpy() + degraded = sub["degraded"].to_numpy() + # smoothing + smooth_c = np.convolve(counts, np.ones(window) / window, mode="same") + dt = np.gradient(sub["time_min"].to_numpy() * 60) + rate = degraded / np.maximum(dt, 1e-10) + smooth_r = np.convolve(rate, np.ones(window) / window, mode="same") + mask = ( + np.isfinite(smooth_c) + & (smooth_c > 0) + & np.isfinite(smooth_r) + & (smooth_r >= 0) + ) + A = smooth_c[mask] + y = smooth_r[mask] + if len(A) < 10: + continue + kdeg = np.linalg.lstsq(A[:, None], y, rcond=None)[0][0] + + # Prepare data for plotting + plot_df = pl.DataFrame({"RNA_counts": A, "RNA_degraded": y}) + # Regression line data + line_x = np.linspace(A.min(), A.max(), 100) + line_y = kdeg * line_x + + # Scatter with blue points + scatter = ( + alt.Chart(plot_df) + .mark_circle(size=20, opacity=0.6, color="blue") + .encode(x="RNA_counts:Q", y="RNA_degraded:Q") + ) + # Regression line with light yellow color + line = ( + alt.Chart(pl.DataFrame({"RNA_counts": line_x, "RNA_degraded": line_y})) + .mark_line(color="red", strokeWidth=0.5) + .encode(x="RNA_counts:Q", y="RNA_degraded:Q") ) - charts.append(chart) + # Combine and style + title = f"{cid} kdeg meas: {kdeg:.1e} s⁻¹ | kdeg exp: {deg_rates[cid]:.1e} s⁻¹" + charts.append((scatter + line).properties(title=title, width=250, height=200)) - # Arrange charts in 3x3 grid if charts: - # Group charts into rows of 3 - rows = [] - for i in range(0, len(charts), 3): - row_charts = charts[i : i + 3] - if len(row_charts) == 1: - rows.append(row_charts[0]) - else: - rows.append(alt.hconcat(*row_charts)) - - # Combine rows vertically - if len(rows) == 1: - combined_plot = rows[0] - else: - combined_plot = alt.vconcat(*rows) - - combined_plot.save(os.path.join(outdir, "rna_decay_03_high.html")) + # Arrange charts in 3x3 grid + rows = [alt.hconcat(*charts[i : i + 3]) for i in range(0, 
len(charts), 3)] + combined = alt.vconcat(*rows).properties( + title="RNA Decay - High Expression Genes" + ) + output = os.path.join(outdir, "rna_decay_03_high.html") + combined.save(output) + print(f"[INFO] Saved visualization to: {output}") + return combined else: - print("No charts were generated due to insufficient data") + print("[ERROR] No charts generated") + return None From 3f6d2c99648602fe70dae451c09ea7b7098493ec Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sat, 28 Jun 2025 17:57:26 -0700 Subject: [PATCH 20/71] Do not set success flag if exception raised --- ecoli/processes/engine_process.py | 1 - runscripts/sim.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ecoli/processes/engine_process.py b/ecoli/processes/engine_process.py index 8f9ad2418..9db5918fc 100644 --- a/ecoli/processes/engine_process.py +++ b/ecoli/processes/engine_process.py @@ -511,7 +511,6 @@ def next_update(self, timestep, states): self.sim.complete() except Exception: if isinstance(self.emitter, ParquetEmitter): - self.emitter.success = True self.emitter.finalize() raise diff --git a/runscripts/sim.py b/runscripts/sim.py index 28820644e..60f88bfb7 100644 --- a/runscripts/sim.py +++ b/runscripts/sim.py @@ -21,10 +21,10 @@ def main(): try: proc.wait() # Give subprocess chance to finish cleanly - except Exception as e: + except Exception: proc.send_signal(signal.SIGINT) proc.wait() - raise e + raise return proc.returncode From 28345d4cc967d99e23c24691561bb904956a0796 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sun, 29 Jun 2025 13:11:27 -0700 Subject: [PATCH 21/71] Ensure emits are finalized even if wrapper is interrupted --- runscripts/sim.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/runscripts/sim.py b/runscripts/sim.py index 60f88bfb7..85c2b45ac 100644 --- a/runscripts/sim.py +++ b/runscripts/sim.py @@ -20,11 +20,10 @@ def main(): proc = subprocess.Popen(cmd) try: proc.wait() - # Give subprocess chance to finish cleanly - 
except Exception: + # Ensure emits are finalized even if wrapper is interrupted + finally: proc.send_signal(signal.SIGINT) proc.wait() - raise return proc.returncode From a3bfd0501616b1703ed9c932e472456097507a2f Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sun, 29 Jun 2025 13:14:12 -0700 Subject: [PATCH 22/71] Cleanup documentation for EcoliSim --- ecoli/experiments/ecoli_master_sim.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/ecoli/experiments/ecoli_master_sim.py b/ecoli/experiments/ecoli_master_sim.py index 3a268d16a..46125d544 100644 --- a/ecoli/experiments/ecoli_master_sim.py +++ b/ecoli/experiments/ecoli_master_sim.py @@ -737,10 +737,10 @@ def update_experiment(self, time_to_update: float = 0.0): Runs the E. coli simulation for a specified amount of time. If the simulation reaches a division event and ``config['generations']`` is set, it will save the daughter cell states to JSON files in the directory - specified by ``config['daughter_outdir']``. If the simulation reaches - the maximum duration specified by ``config['max_duration']``, it will - raise a :py:class:`~ecoli.experiments.ecoli_master_sim.TimeLimitError` - if ``config['fail_at_max_duration']`` is ``True``. + specified by ``config['daughter_outdir']``. Also creates a file + ``division_time.sh`` that, when executed, sets the environment variable + ``division_time`` to the time at which division occurred (used in + Nextflow workflow runs). """ try: self.ecoli_experiment.update(time_to_update) @@ -754,7 +754,7 @@ def update_experiment(self, time_to_update: float = 0.0): ) write_json(daughter_path, agent_state) print( - f"Divided at t = {self.ecoli_experiment.global_time} after" + f"Divided at t = {self.ecoli_experiment.global_time} after " f"{self.ecoli_experiment.global_time - self.initial_global_time} sec." 
) with open("division_time.sh", "w") as f: @@ -763,8 +763,10 @@ def update_experiment(self, time_to_update: float = 0.0): if isinstance(self.ecoli_experiment.emitter, ParquetEmitter): self.ecoli_experiment.emitter.success = True self.ecoli_experiment.emitter.finalize() + # Exit so that EcoliSim.run() does not raise TimeLimitError sys.exit() finally: + # Finish writing any buffered emits to Parquet files if isinstance(self.ecoli_experiment.emitter, ParquetEmitter): self.ecoli_experiment.emitter.finalize() @@ -804,7 +806,10 @@ def save_states(self): self.update_experiment(time_remaining) def run(self): - """Create and run an EcoliSim experiment. + """Create and run an EcoliSim experiment. If the simulation reaches + the maximum duration specified by ``config['max_duration']``, it will + raise a :py:class:`~ecoli.experiments.ecoli_master_sim.TimeLimitError` + if ``config['fail_at_max_duration']`` is ``True``. .. WARNING:: Run :py:meth:`~ecoli.experiments.ecoli_master_sim.EcoliSim.build_ecoli` From 8a44960db65721301281fc846e737df1e06dd313 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Sun, 29 Jun 2025 13:40:29 -0700 Subject: [PATCH 23/71] Handle keyboard interrupt in engine process --- ecoli/processes/engine_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecoli/processes/engine_process.py b/ecoli/processes/engine_process.py index 9db5918fc..c7bea28da 100644 --- a/ecoli/processes/engine_process.py +++ b/ecoli/processes/engine_process.py @@ -509,7 +509,7 @@ def next_update(self, timestep, states): self.sim.run_for(timestep) if force_complete: self.sim.complete() - except Exception: + except (Exception, KeyboardInterrupt): if isinstance(self.emitter, ParquetEmitter): self.emitter.finalize() raise From bb604d5a7b6964a6eafe966a527d78c9ddc89274 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Mon, 30 Jun 2025 10:35:58 +0800 Subject: [PATCH 24/71] BUG FIX at ecoli_master_sim.py --- ecoli/experiments/ecoli_master_sim.py 
| 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ecoli/experiments/ecoli_master_sim.py b/ecoli/experiments/ecoli_master_sim.py index f17cc8152..cadf0c9b8 100644 --- a/ecoli/experiments/ecoli_master_sim.py +++ b/ecoli/experiments/ecoli_master_sim.py @@ -117,10 +117,16 @@ def get_git_diff() -> str: Raises an error if both methods fail. """ # Try to run git command + # try: + # return ( + # subprocess.check_output(["git", "-C", CONFIG_DIR_PATH, "diff", "HEAD"]) + # .decode("ascii") + # .strip() + # ) try: return ( subprocess.check_output(["git", "-C", CONFIG_DIR_PATH, "diff", "HEAD"]) - .decode("ascii") + .decode("utf-8") .strip() ) except (subprocess.CalledProcessError, FileNotFoundError): From cad30444594a4cfb8a6766ca9c5e1a5513f5c4ed Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Mon, 30 Jun 2025 16:55:25 -0700 Subject: [PATCH 25/71] Specify uv install method --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2aa1603d7..c0edcbe5e 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ git clone https://github.com/CovertLab/vEcoli.git > a new directory called `vEcoli` in your current directory. To speed up > the clone and save disk space, add `--filter=blob:none` to the command. -2. [Follow these instructions](https://docs.astral.sh/uv/getting-started/installation/) +2. [Follow these "Standalone installer" instructions](https://docs.astral.sh/uv/getting-started/installation/) to install `uv`, our Python package and project manager of choice. 3. Close and reopen your terminal. 
From 1572e25a4f42d3abc8276afb3049ec759c0a50be Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Wed, 2 Jul 2025 07:15:41 +0800 Subject: [PATCH 26/71] File_path modification for analysis.py --- runscripts/analysis.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/runscripts/analysis.py b/runscripts/analysis.py index d8ad813b1..6445bba9e 100644 --- a/runscripts/analysis.py +++ b/runscripts/analysis.py @@ -293,7 +293,6 @@ def main(): variant_names = {config["experiment_id"][0]: variant_name} # Establish DuckDB connection - # print(f"[DEBUG] The out_uri for analyses is: {out_uri}") conn = create_duckdb_conn(out_uri, gcs_bucket, config.get("cpus")) history_sql, config_sql, success_sql = dataset_sql(out_uri, config["experiment_id"]) # If no explicit analysis type given, run all types in config JSON @@ -340,6 +339,8 @@ def main(): curr_outdir, ) else: + curr_outdir = os.path.abspath(config["outdir"]) + os.makedirs(curr_outdir, exist_ok=True) query_strings[duckdb_filter] = ( f"SELECT * FROM ({history_sql}) WHERE {duckdb_filter}", f"SELECT * FROM ({config_sql}) WHERE {duckdb_filter}", @@ -370,9 +371,6 @@ def main(): variant_names, ) - top_outdir = os.path.abspath(config["outdir"]) - os.makedirs(top_outdir, exist_ok=True) - # Save copy of config JSON with parameters for plots with open( os.path.join(os.path.abspath(config["outdir"]), "metadata.json"), "w" From 9fbe9fd136c94628df92304dd350ab77ac4240f6 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Wed, 2 Jul 2025 07:19:15 +0800 Subject: [PATCH 27/71] Modification for 5 multigeneration analysing methods --- ecoli/analysis/multigeneration/replication.py | 145 ++---- .../multigeneration/ribosome_crowding.py | 1 + .../multigeneration/ribosome_production.py | 389 +++++++++++++++ .../multigeneration/ribosome_usage.py | 448 ++++++++++++++++++ .../multigeneration/rna_decay_03_high.py | 36 +- 5 files changed, 908 insertions(+), 111 deletions(-) create 
mode 100644 ecoli/analysis/multigeneration/ribosome_production.py create mode 100644 ecoli/analysis/multigeneration/ribosome_usage.py diff --git a/ecoli/analysis/multigeneration/replication.py b/ecoli/analysis/multigeneration/replication.py index fb1ef7d39..9498bf187 100644 --- a/ecoli/analysis/multigeneration/replication.py +++ b/ecoli/analysis/multigeneration/replication.py @@ -1,5 +1,5 @@ """ -The multigeneration aanlysis method `replication` +The multigeneration analysis method `replication` 1. Record the DNA polymerase position vs time 2. Record # of pairs of replication forks 3. Record the factors of critical initial mass and dry mass @@ -13,7 +13,6 @@ from duckdb import DuckDBPyConnection import polars as pl -import numpy as np from ecoli.library.parquet_emitter import ( open_arbitrary_sim_data, @@ -43,114 +42,57 @@ def plot( sim_data = pickle.load(f) genome_length = len(sim_data.process.replication.genome_sequence) - # Discover available columns - result = conn.sql(f"DESCRIBE ({history_sql})").pl() - available_columns = result["column_name"].to_list() - - # Filter for relevant columns - replication_columns = [ - col for col in available_columns if "replication" in col.lower() + # Define data columns with proper listener names and aliases + data_columns = [ + 'time / 3600 AS "Time (hr)"', + "listeners__replication_data__fork_coordinates AS fork_coordinates", + "listeners__replication_data__number_of_oric AS number_of_oric", + "listeners__mass__cell_mass AS cell_mass", + "listeners__mass__dry_mass AS dry_mass", + "listeners__replication_data__critical_initiation_mass AS critical_initiation_mass", + "listeners__replication_data__critical_mass_per_oric AS critical_mass_per_oric", ] - mass_columns = [col for col in available_columns if "mass" in col.lower()] - - print( - f"Found {len(replication_columns)} replication columns and {len(mass_columns)} mass columns" - ) - - # Define required columns mapping - column_mapping = { - "number_of_oric": next( - (col for 
col in available_columns if "number_of_oric" in col), None - ), - "fork_coordinates": next( - (col for col in available_columns if "fork_coordinates" in col), None - ), - "cell_mass": next( - ( - col - for col in available_columns - if "cell_mass" in col and "fold_change" not in col - ), - None, - ), - "dry_mass": next( - ( - col - for col in available_columns - if "dry_mass" in col and "fold_change" not in col - ), - None, - ), - "critical_initiation_mass": next( - (col for col in available_columns if "critical_initiation_mass" in col), - None, - ), - "critical_mass_per_oric": next( - (col for col in available_columns if "critical_mass_per_oric" in col), None - ), - } - - # Build list of columns to load - data_columns = ["time"] - for key, col_name in column_mapping.items(): - if col_name: - data_columns.append(col_name) - print(f"Using {col_name} for {key}") # Load data plot_data = read_stacked_columns(history_sql, data_columns, conn=conn) - # Convert to DataFrame and add time in hours - df = pl.DataFrame(plot_data).with_columns( - pl.col("time").truediv(3600).alias("Time (hr)") - ) - - print(f"Loaded data: {df.shape[0]} rows, {df.shape[1]} columns") - - # Process fork coordinates and calculate pairs of forks - if column_mapping["fork_coordinates"]: - fork_coords_col = column_mapping["fork_coordinates"] - pairs_of_forks = [] + # Convert to DataFrame + df = pl.DataFrame(plot_data) - for coord_array in df[fork_coords_col].to_numpy(): - if coord_array is not None and len(coord_array) > 0: - # Count non-NaN coordinates and divide by 2 for pairs - pairs_of_forks.append(np.sum(~np.isnan(coord_array)) / 2) - else: - pairs_of_forks.append(0) - - df = df.with_columns(pl.Series("pairs_of_forks", pairs_of_forks)) + # Process fork coordinates and calculate pairs of forks using Polars + if "fork_coordinates" in df.columns: + df = df.with_columns( + pairs_of_forks=pl.col("fork_coordinates") + .list.eval(~pl.element().is_nan()) + .list.sum() + / 2 + ) # Calculate critical mass 
equivalents - if column_mapping["cell_mass"] and column_mapping["critical_initiation_mass"]: + if "cell_mass" in df.columns and "critical_initiation_mass" in df.columns: df = df.with_columns( - ( - pl.col(column_mapping["cell_mass"]) - / pl.col(column_mapping["critical_initiation_mass"]) - ).alias("critical_mass_equivalents") + critical_mass_equivalents=( + pl.col("cell_mass") / pl.col("critical_initiation_mass") + ) ) + # ----------------------------------------- # # Create visualization functions def create_fork_positions_plot(): """Create DNA polymerase positions scatter plot.""" - if not column_mapping["fork_coordinates"]: + if "fork_coordinates" not in df.columns: return None - fork_positions_data = [] - fork_coords_col = column_mapping["fork_coordinates"] - - for time_val, coords in zip(df["Time (hr)"], df[fork_coords_col]): - if coords is not None and len(coords) > 0: - for coord in coords: - if not np.isnan(coord): - fork_positions_data.append( - {"Time (hr)": time_val, "Position": coord} - ) + # Explode fork coordinates and filter out NaN values + fork_df = ( + df.select(["Time (hr)", "fork_coordinates"]) + .explode("fork_coordinates") + .filter(~pl.col("fork_coordinates").is_nan()) + .rename({"fork_coordinates": "Position"}) + ) - if not fork_positions_data: + if fork_df.height == 0: return None - - fork_df = pl.DataFrame(fork_positions_data) return ( alt.Chart(fork_df) .mark_circle(size=5, opacity=0.7) @@ -175,7 +117,7 @@ def create_pairs_of_forks_plot(): return None return ( - alt.Chart(df.to_pandas()) + alt.Chart(df) .mark_line(strokeWidth=2) .encode( x=alt.X("Time (hr):Q", title="Time (hr)"), @@ -195,7 +137,7 @@ def create_critical_mass_plot(): # Main line plot base_plot = ( - alt.Chart(df.to_pandas()) + alt.Chart(df) .mark_line(strokeWidth=2) .encode( x=alt.X("Time (hr):Q", title="Time (hr)"), @@ -212,14 +154,14 @@ def create_critical_mass_plot(): ) reference_lines = ( - alt.Chart(reference_data.to_pandas()) + alt.Chart(reference_data) 
.mark_rule(strokeDash=[5, 5], color="gray", opacity=0.7) .encode(y="y:Q") ) # Text labels for reference lines reference_labels = ( - alt.Chart(reference_data.to_pandas()) + alt.Chart(reference_data) .mark_text(align="left", dx=5, fontSize=10, color="gray") .encode(y="y:Q", text="label:N") .transform_calculate(x="0") @@ -230,21 +172,22 @@ def create_critical_mass_plot(): title="Factors of Critical Initiation Mass", width=600, height=100 ) - def create_mass_plot(column_key: str, title: str, y_title: str): + def create_mass_plot(column_name: str, title: str, y_title: str): """Create a generic mass plot.""" - if not column_mapping[column_key]: + if column_name not in df.columns: return None return ( - alt.Chart(df.to_pandas()) + alt.Chart(df) .mark_line(strokeWidth=2) .encode( x=alt.X("Time (hr):Q", title="Time (hr)"), - y=alt.Y(f"{column_mapping[column_key]}:Q", title=y_title), + y=alt.Y(f"{column_name}:Q", title=y_title), ) .properties(title=title, width=600, height=100) ) + # ----------------------------------------- # # Generate all plots plots = [] @@ -290,7 +233,7 @@ def create_mass_plot(column_key: str, title: str, y_title: str): {"x": [0], "y": [0], "text": ["No data available for plotting"]} ) combined_plot = ( - alt.Chart(fallback_data.to_pandas()) + alt.Chart(fallback_data) .mark_text(fontSize=20, color="red") .encode(x=alt.X("x:Q", axis=None), y=alt.Y("y:Q", axis=None), text="text:N") .properties(width=600, height=400, title="Replication Data Visualization") diff --git a/ecoli/analysis/multigeneration/ribosome_crowding.py b/ecoli/analysis/multigeneration/ribosome_crowding.py index 559ae95c8..dc98402cb 100644 --- a/ecoli/analysis/multigeneration/ribosome_crowding.py +++ b/ecoli/analysis/multigeneration/ribosome_crowding.py @@ -260,6 +260,7 @@ def plot( scale=alt.Scale( domain=["target", "actual"], range=[[1, 0], [5, 5]] ), + legend=None, ), tooltip=[ alt.Tooltip("Time_min:Q", title="Time (min)", format=".2f"), diff --git 
a/ecoli/analysis/multigeneration/ribosome_production.py b/ecoli/analysis/multigeneration/ribosome_production.py new file mode 100644 index 000000000..f42e1e4b9 --- /dev/null +++ b/ecoli/analysis/multigeneration/ribosome_production.py @@ -0,0 +1,389 @@ +import os +from typing import Any +import altair as alt +import pickle +import polars as pl +import numpy as np +from duckdb import DuckDBPyConnection +import pandas as pd + +from ecoli.library.parquet_emitter import open_arbitrary_sim_data, named_idx +from ecoli.library.schema import bulk_name_to_idx + + +# ----------------------------------------- # + + +def calc_rna_doubling_time( + produced_col: str, count_col: str, borderline: float +) -> pl.Expr: + """ + Calculate rRNA doubling time with sanitation. + """ + production_rate = pl.col(produced_col) / pl.col("time_step_sec") + growth_rate = production_rate / pl.col(count_col) + dt_min = np.log(2) / growth_rate / 60 + valid = ( + (pl.col(produced_col) >= 0) + & (pl.col(count_col) > 0) + & (growth_rate > 0) + & dt_min.is_finite() + & (dt_min > 0) + & (dt_min < 2 * borderline) + ) + return pl.when(valid).then(dt_min).otherwise(None) + + +def plot( + params: dict[str, Any], + conn: DuckDBPyConnection, + history_sql: str, + config_sql: str, + success_sql: str, + sim_data_dict: dict[str, dict[int, str]], + validation_data_paths: list[str], + outdir: str, + variant_metadata: dict[str, dict[int, Any]], + variant_names: dict[str, str], +): + """Visualize ribosome production metrics for E. 
coli simulation.""" + with open_arbitrary_sim_data(sim_data_dict) as f: + sim_data = pickle.load(f) + + sim_doubling_time = sim_data.doubling_time.asNumber() + + # define rRNA groups and bulk IDs + s30_16s = list(sim_data.molecule_groups.s30_16s_rRNA) + [ + sim_data.molecule_ids.s30_full_complex + ] + s50_23s = list(sim_data.molecule_groups.s50_23s_rRNA) + [ + sim_data.molecule_ids.s50_full_complex + ] + s50_5s = list(sim_data.molecule_groups.s50_5s_rRNA) + [ + sim_data.molecule_ids.s50_full_complex + ] + bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"].tolist() + + # precompute indices as Python ints + idx_16s = [int(i) for i in bulk_name_to_idx(s30_16s, bulk_ids)] + idx_23s = [int(i) for i in bulk_name_to_idx(s50_23s, bulk_ids)] + idx_5s = [int(i) for i in bulk_name_to_idx(s50_5s, bulk_ids)] + + required_columns = [ + "time", + "variant", + "generation", + "agent_id", + "listeners__mass__instantaneous_growth_rate", + "listeners__mass__dry_mass", + "listeners__ribosome_data__rRNA16S_initiated", + "listeners__ribosome_data__rRNA23S_initiated", + "listeners__ribosome_data__rRNA5S_initiated", + "listeners__ribosome_data__rRNA16S_init_prob", + "listeners__ribosome_data__rRNA23S_init_prob", + "listeners__ribosome_data__rRNA5S_init_prob", + "listeners__ribosome_data__effective_elongation_rate", + "listeners__unique_molecule_counts__active_ribosome", + "bulk", + ] + + # load data + # Extract each bulk index into its own column using named_idx(), then sum per rRNA species + idx_groups = {"bulk_16s": idx_16s, "bulk_23s": idx_23s, "bulk_5s": idx_5s} + projections = ( + required_columns + + [ + named_idx(col="bulk", names=[f"{grp}_{i}"], idx=[[i]], zero_to_null=True) + for grp, idxs in idx_groups.items() + for i in idxs + ] + + [ + f"({' + '.join(f'{grp}_{i}' for i in idxs)}) AS {grp}_count" + for grp, idxs in idx_groups.items() + ] + ) + + sql = f""" + SELECT {", ".join(projections)} + FROM ({history_sql}) + WHERE agent_id = 0 + ORDER BY generation, time + 
""" + df = conn.sql(sql).pl() + + # time + df = df.with_columns((pl.col("time") / 60).alias("time_min")) + + df = df.with_columns( + pl.col("time") + .diff() + .over(["variant", "generation", "agent_id"]) + .alias("time_step_sec") + ) + df = df.with_columns( + time_step_sec=pl.when(pl.col("time_step_sec").is_null()) + .then(pl.col("time")) + .otherwise(pl.col("time_step_sec")) + ) + + # cell doubling time + if "listeners__mass__instantaneous_growth_rate" in df.columns: + val = np.log(2) / pl.col("listeners__mass__instantaneous_growth_rate") / 60 + df = df.with_columns( + pl.when(val.is_between(0, 2 * sim_doubling_time, closed="both")) + .then(val) + .otherwise(None) + .alias("cell_doubling_time_min") + ) + + df = df.with_columns( + [ + # compute bulk rRNA counts + pl.col("bulk") + .map_elements( + lambda arr: sum(arr[i] for i in idx_16s if i < len(arr)), + return_dtype=pl.Float64, + ) + .fill_null(0) + .alias("bulk_16s_count"), + pl.col("bulk") + .map_elements( + lambda arr: sum(arr[i] for i in idx_23s if i < len(arr)), + return_dtype=pl.Float64, + ) + .fill_null(0) + .alias("bulk_23s_count"), + pl.col("bulk") + .map_elements( + lambda arr: sum(arr[i] for i in idx_5s if i < len(arr)), + return_dtype=pl.Float64, + ) + .fill_null(0) + .alias("bulk_5s_count"), + # compute unique ribosomes + pl.col("listeners__unique_molecule_counts__active_ribosome") + .fill_null(0) + .alias("ribosome_count"), + ] + ) + + # total rRNA + df = df.with_columns( + [ + (pl.col("bulk_16s_count") + pl.col("ribosome_count")).alias("rrn16s_count"), + (pl.col("bulk_23s_count") + pl.col("ribosome_count")).alias("rrn23s_count"), + (pl.col("bulk_5s_count") + pl.col("ribosome_count")).alias("rrn5s_count"), + ] + ) + + # rRNA doubling times + if "listeners__ribosome_data__rRNA16S_initiated" in df.columns: + df = df.with_columns( + rrn16S_doubling_time_min=calc_rna_doubling_time( + "listeners__ribosome_data__rRNA16S_initiated", + "rrn16s_count", + sim_doubling_time, + ) + ) + if 
"listeners__ribosome_data__rRNA23S_initiated" in df.columns: + df = df.with_columns( + rrn23S_doubling_time_min=calc_rna_doubling_time( + "listeners__ribosome_data__rRNA23S_initiated", + "rrn23s_count", + sim_doubling_time, + ) + ) + if "listeners__ribosome_data__rRNA5S_initiated" in df.columns: + df = df.with_columns( + rrn5S_doubling_time_min=calc_rna_doubling_time( + "listeners__ribosome_data__rRNA5S_initiated", + "rrn5s_count", + sim_doubling_time, + ) + ) + + # reference probabilities + cond = sim_data.condition + trans = sim_data.process.transcription + synth_probs = trans.cistron_tu_mapping_matrix.dot(trans.rna_synth_prob[cond]) + + def fit_prob(group_ids): + cistrons = [rid[:-3] for rid in group_ids] + idxs = np.where(np.isin(trans.cistron_data["id"], cistrons))[0] + return synth_probs[idxs].sum() if idxs.size else 0.0 + + ref_probs = { + "16S": fit_prob(sim_data.molecule_groups.s30_16s_rRNA), + "23S": fit_prob(sim_data.molecule_groups.s50_23s_rRNA), + "5S": fit_prob(sim_data.molecule_groups.s50_5s_rRNA), + } + + # ----------------------------------------- # + # prepare for plotting + plot_cols = ["time_min", "variant", "generation"] + + for c in [ + "listeners__mass__dry_mass", + "cell_doubling_time_min", + "rrn16S_doubling_time_min", + "rrn23S_doubling_time_min", + "rrn5S_doubling_time_min", + "rrn16S_init_prob", + "rrn23S_init_prob", + "rrn5S_init_prob", + "listeners__ribosome_data__effective_elongation_rate", + ]: + if c in df.columns: + plot_cols.append(c) + + plot_df = df.select(plot_cols) + + init_dm = ( + plot_df.filter(pl.col("time_min") == 0) + .select(["variant", "listeners__mass__dry_mass"]) + .rename({"listeners__mass__dry_mass": "initial_dry_mass"}) + ) + plot_df = plot_df.join(init_dm, on=["variant"], how="left") + plot_df = plot_df.with_columns( + (pl.col("listeners__mass__dry_mass") / pl.col("initial_dry_mass")).alias( + "dry_mass_normalized" + ) + ) + + # generate Altair charts + def create_line_chart(y, title, y_title, ref=None): + base = 
alt.Chart(plot_df) + line = ( + base.mark_line() + .encode( + x=alt.X("time_min:Q", title="Time (min)"), + y=alt.Y(f"{y}:Q", title=y_title), + color=alt.Color( + "generation:N", + legend=alt.Legend(title="Simulated Multigeneration Data"), + ), + ) + .properties(title=title, width=600, height=120) + ) + if ref is not None: + rule = ( + alt.Chart(pd.DataFrame({"y": [ref]})) + .mark_rule(color="red", strokeDash=[5, 5]) + .encode(y="y:Q") + ) + return line + rule + return line + + def create_histogram( + col: str, title: str, bins: int = 30, probability: bool = False + ) -> alt.Chart: + if probability: + density = ( + alt.Chart(plot_df) + .transform_density(col, as_=[col, "density"], counts=False, steps=bins) + .mark_area(opacity=0.6) + .encode( + x=alt.X(f"{col}:Q", title=f"bin={bins}"), + y=alt.Y("density:Q", title="Density"), + ) + .properties(width=200, height=120, title=title) + ) + return density + else: + hist = ( + alt.Chart(plot_df) + .mark_bar(opacity=0.6) + .encode( + x=alt.X(f"{col}:Q", bin=alt.Bin(maxbins=bins), title=f"bin={bins}"), + y=alt.Y("count():Q", title="Count"), + color=alt.value("steelblue"), + ) + .properties(width=200, height=120, title=title) + ) + return hist + + plots = [] + # Dry mass + if "dry_mass_normalized" in plot_df.columns: + line = create_line_chart( + "dry_mass_normalized", + "Normalized Dry Mass Over Time", + "Dry mass (relative to t=0)", + ) + hist = create_histogram( + "dry_mass_normalized", "Normalized Dry Mass Distribution", probability=True + ) + plots.append(alt.hconcat(line, hist)) + # Cell Doubling Time + if "cell_doubling_time_min" in plot_df.columns: + line = create_line_chart( + "cell_doubling_time_min", + "Cell Doubling Time", + "Doubling Time (min)", + sim_doubling_time, + ) + hist = create_histogram( + "cell_doubling_time_min", + "Cell Doubling Time (min) Distribution", + probability=True, + ) + plots.append(alt.hconcat(line, hist)) + # rRNA Doubl;ing Time + for suffix in ["16S", "23S", "5S"]: + col = 
f"rrn{suffix}_doubling_time_min" + if col in plot_df.columns: + line = create_line_chart( + col, + f"{suffix} rRNA Doubling Time", + "Doubling Time (min)", + sim_doubling_time, + ) + hist = create_histogram( + col, f"{suffix} rRNA Doubling Time Distribution", probability=True + ) + plots.append(alt.hconcat(line, hist)) + # rRNA Initiation Probability + for suffix, ref in ref_probs.items(): + col = f"rrn{suffix}_init_prob" + if col in plot_df.columns: + line = create_line_chart( + col, f"{suffix} rRNA Initiation Probability", "Probability", ref + ) + hist = create_histogram( + col, + f"{suffix} rRNA Initiation Probability Distribution", + probability=True, + ) + plots.append(alt.hconcat(line, hist)) + # Ribosome Elongation Rate + if "listeners__ribosome_data__effective_elongation_rate" in plot_df.columns: + line = create_line_chart( + "listeners__ribosome_data__effective_elongation_rate", + "Ribosome Elongation Rate", + "Amino acids/s", + ) + hist = create_histogram( + "listeners__ribosome_data__effective_elongation_rate", + "Ribosome Elongation Rate Distribution", + probability=True, + ) + plots.append(alt.hconcat(line, hist)) + + if not plots: + fallback = pl.DataFrame({"message": ["No data available"], "x": [0], "y": [0]}) + plots.append( + alt.Chart(fallback.to_pandas()) + .mark_text(size=20, color="red") + .encode(x="x:Q", y="y:Q", text="message:N") + .properties(width=600, height=400, title="No Data") + ) + + combined = ( + alt.vconcat(*plots) + .resolve_scale(x="shared", y="independent") + .properties(title="Ribosome Production Metrics") + ) + out_path = os.path.join(outdir, "ribosome_production_report.html") + combined.save(out_path) + print(f"Saved visualization to: {out_path}") + return combined diff --git a/ecoli/analysis/multigeneration/ribosome_usage.py b/ecoli/analysis/multigeneration/ribosome_usage.py new file mode 100644 index 000000000..184090134 --- /dev/null +++ b/ecoli/analysis/multigeneration/ribosome_usage.py @@ -0,0 +1,448 @@ +""" +Record 
several things: +1. cell volume over time +2. total / active ribosome count and concentration +3. active ribosome molar / mass fraction +4. Ribosome activation / deactivation count +5. # of AA. be translated +6. the effective ribosome elongation rate +""" + +import altair as alt +import os +from typing import Any +import pickle + +import polars as pl +from duckdb import DuckDBPyConnection +import pandas as pd + +from ecoli.library.parquet_emitter import ( + open_arbitrary_sim_data, +) +from ecoli.library.schema import bulk_name_to_idx + +# ----------------------------------------- # + + +def plot( + params: dict[str, Any], + conn: DuckDBPyConnection, + history_sql: str, + config_sql: str, + success_sql: str, + sim_data_dict: dict[str, dict[int, str]], + validation_data_paths: list[str], + outdir: str, + variant_metadata: dict[str, dict[int, Any]], + variant_names: dict[str, str], +): + """Visualize ribosome usage statistics for E. coli simulation.""" + # Load sim_data + with open_arbitrary_sim_data(sim_data_dict) as f: + sim_data = pickle.load(f) + + # Get molecular IDs for ribosome subunits + complex_ids_30s = [sim_data.molecule_ids.s30_full_complex] + complex_ids_50s = [sim_data.molecule_ids.s50_full_complex] + bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"].tolist() + + # precompute indices as Python ints (following ribosome_production.py pattern) + idx_30s = [int(i) for i in bulk_name_to_idx(complex_ids_30s, bulk_ids)] + idx_50s = [int(i) for i in bulk_name_to_idx(complex_ids_50s, bulk_ids)] + + # Get molecular weights + n_avogadro = sim_data.constants.n_avogadro + mw_30s = sim_data.getter.get_masses(complex_ids_30s) + mw_50s = sim_data.getter.get_masses(complex_ids_50s) + mw_70s = mw_30s + mw_50s + + required_columns = [ + "time", + "variant", + "generation", + "agent_id", + "experiment_id", + "lineage_seed", + "listeners__mass__instantaneous_growth_rate", + "listeners__mass__cell_mass", + "listeners__mass__volume", + 
"listeners__ribosome_data__did_initialize", + "listeners__ribosome_data__actual_elongations", + "listeners__ribosome_data__did_terminate", + "listeners__ribosome_data__effective_elongation_rate", + "listeners__unique_molecule_counts__active_ribosome", + "bulk", + ] + + # Check available columns + available_columns = ( + conn.sql(f"DESCRIBE ({history_sql})").pl()["column_name"].to_list() + ) + data_columns = [col for col in required_columns if col in available_columns] + + print(f"[INFO] Loading {len(data_columns)} columns for ribosome usage analysis") + + df = conn.sql(f""" + SELECT {", ".join(data_columns)} + FROM ({history_sql}) + WHERE agent_id = 0 + ORDER BY variant, generation, time + """).pl() + + # Convert time + if "time" in df.columns: + df = df.with_columns((pl.col("time") / 60).alias("time_min")) + df = df.with_columns([(pl.col("time") + 1).alias("time_step_sec")]) + + # Calculate ribosome subunit counts + df = df.with_columns( + [ + # compute bulk ribosome subunit counts + pl.col("bulk") + .map_elements( + lambda arr: sum(arr[i] for i in idx_30s if i < len(arr)), + return_dtype=pl.Float64, + ) + .fill_null(0) + .alias("counts_30s"), + pl.col("bulk") + .map_elements( + lambda arr: sum(arr[i] for i in idx_50s if i < len(arr)), + return_dtype=pl.Float64, + ) + .fill_null(0) + .alias("counts_50s"), + # compute unique ribosomes + pl.col("listeners__unique_molecule_counts__active_ribosome") + .fill_null(0) + .alias("active_ribosome_counts"), + ] + ) + + # Calculate total ribosome counts and fractions + df = df.with_columns( + [ + ( + pl.col("active_ribosome_counts") + + pl.min_horizontal(pl.col("counts_30s"), pl.col("counts_50s")) + ).alias("total_ribosome_counts"), + ( + pl.col("active_ribosome_counts").cast(pl.Float64) + / ( + pl.col("active_ribosome_counts") + + pl.min_horizontal(pl.col("counts_30s"), pl.col("counts_50s")) + ) + ).alias("molar_fraction_active"), + ] + ) + + if "listeners__mass__cell_mass" in df.columns: + cell_density = 
sim_data.constants.cell_density.asNumber() + df = df.with_columns( + (1e-15 * pl.col("listeners__mass__cell_mass") / cell_density).alias( + "cell_volume" + ) + ) + + # Calculate concentrations + df = df.with_columns( + [ + ( + pl.col("total_ribosome_counts") + / n_avogadro.asNumber() + / pl.col("cell_volume") + ).alias("total_ribosome_concentration_mM"), + ( + pl.col("active_ribosome_counts") + / n_avogadro.asNumber() + / pl.col("cell_volume") + ).alias("active_ribosome_concentration_mM"), + ] + ) + + # Calculate masses + mw_30s_value = mw_30s.asNumber() if hasattr(mw_30s, "asNumber") else float(mw_30s) + mw_50s_value = mw_50s.asNumber() if hasattr(mw_50s, "asNumber") else float(mw_50s) + mw_70s_value = mw_70s.asNumber() if hasattr(mw_70s, "asNumber") else float(mw_70s) + + df = df.with_columns( + [ + (pl.col("counts_30s") / n_avogadro.asNumber() * mw_30s_value).alias( + "mass_30s" + ), + (pl.col("counts_50s") / n_avogadro.asNumber() * mw_50s_value).alias( + "mass_50s" + ), + ( + pl.col("active_ribosome_counts") / n_avogadro.asNumber() * mw_70s_value + ).alias("active_ribosome_mass"), + ] + ) + + df = df.with_columns( + [ + ( + pl.col("active_ribosome_mass") + pl.col("mass_30s") + pl.col("mass_50s") + ).alias("total_ribosome_mass"), + ( + pl.col("active_ribosome_mass") + / ( + pl.col("active_ribosome_mass") + + pl.col("mass_30s") + + pl.col("mass_50s") + ) + ).alias("mass_fraction_active"), + ] + ) + + # Calculate rates per time and volume + if "time_step_sec" in df.columns and "cell_volume" in df.columns: + df = df.with_columns( + [ + ( + pl.col("listeners__ribosome_data__did_initialize") + / (pl.col("cell_volume") / 1e-15) + ).alias("activations_per_volume"), + ( + pl.col("listeners__ribosome_data__did_terminate") + / (pl.col("cell_volume") / 1e-15) + ).alias("deactivations_per_volume"), + ] + ) + + # Select columns for plotting + plot_columns = ["time_min", "variant", "generation"] + + # Add other columns that exist + for col in [ + "time_step_sec", + 
"cell_volume", + "total_ribosome_counts", + "total_ribosome_concentration_mM", + "active_ribosome_counts", + "active_ribosome_concentration_mM", + "molar_fraction_active", + "mass_fraction_active", + "listeners__ribosome_data__did_initialize", + "listeners__ribosome_data__did_terminate", + "activations_per_volume", + "deactivations_per_volume", + "listeners__ribosome_data__actual_elongations", + "listeners__ribosome_data__effective_elongation_rate", + ]: + if col in df.columns: + plot_columns.append(col) + + plot_df = df.select(plot_columns) + + # ----------------------------------------- # + + def create_line_chart(y_field, title, y_title, skip_first_point=False): + """Create line chart with optional skipping of first data point.""" + data = plot_df.to_pandas() + if skip_first_point: + # Group by variant and generation, skip first point of each group + filtered_data = [] + for (variant, generation), group in data.groupby(["variant", "generation"]): + if len(group) > 1: + filtered_data.append(group.iloc[1:]) + else: + filtered_data.append(group) + data = ( + pd.concat(filtered_data, ignore_index=True) if filtered_data else data + ) + + chart = ( + alt.Chart(data) + .mark_line() + .encode( + x=alt.X("time_min:Q", title="Time (min)"), + y=alt.Y(f"{y_field}:Q", title=y_title), + color=alt.Color("generation:N", legend=alt.Legend(title="Variant")), + ) + .properties(title=title, width=600, height=120) + ) + + return chart + + # ----------------------------------------- # + plots = [] + + # Create all 14 plots following the original order + if "time_step_sec" in plot_df.columns: + plots.append( + create_line_chart( + "time_step_sec", "Length of Time Step", "Length of time step (s)" + ) + ) + + if "cell_volume" in plot_df.columns: + plots.append(create_line_chart("cell_volume", "Cell Volume", "Cell volume (L)")) + + if "total_ribosome_counts" in plot_df.columns: + plots.append( + create_line_chart( + "total_ribosome_counts", "Total Ribosome Count", "Total ribosome count" 
+ ) + ) + + if "total_ribosome_concentration_mM" in plot_df.columns: + plots.append( + create_line_chart( + "total_ribosome_concentration_mM", + "Total Ribosome Concentration", + "[Total ribosome] (mM)", + ) + ) + + if "active_ribosome_counts" in plot_df.columns: + plots.append( + create_line_chart( + "active_ribosome_counts", + "Active Ribosome Count", + "Active ribosome count", + skip_first_point=True, + ) + ) + + if "active_ribosome_concentration_mM" in plot_df.columns: + plots.append( + create_line_chart( + "active_ribosome_concentration_mM", + "Active Ribosome Concentration", + "[Active ribosome] (mM)", + skip_first_point=True, + ) + ) + + if "molar_fraction_active" in plot_df.columns: + plots.append( + create_line_chart( + "molar_fraction_active", + "Molar Fraction Active Ribosomes", + "Molar fraction active ribosomes", + skip_first_point=True, + ) + ) + + if "mass_fraction_active" in plot_df.columns: + plots.append( + create_line_chart( + "mass_fraction_active", + "Mass Fraction Active Ribosomes", + "Mass fraction active ribosomes", + skip_first_point=True, + ) + ) + + if "listeners__ribosome_data__did_initialize" in plot_df.columns: + plots.append( + create_line_chart( + "listeners__ribosome_data__did_initialize", + "Ribosome Activations", + "Activations per timestep", + ) + ) + + if "listeners__ribosome_data__did_terminate" in plot_df.columns: + plots.append( + create_line_chart( + "listeners__ribosome_data__did_terminate", + "Ribosome Deactivations", + "Deactivations per timestep", + ) + ) + + if "activations_per_volume" in plot_df.columns: + plots.append( + create_line_chart( + "activations_per_volume", + "Activations per Volume (fL)", + "Activations per Volume (fL)", + ) + ) + + if "deactivations_per_volume" in plot_df.columns: + plots.append( + create_line_chart( + "deactivations_per_volume", + "Deactivations per Volume (fL)", + "Deactivations per Volume (fL)", + ) + ) + + if "listeners__ribosome_data__actual_elongations" in plot_df.columns: + 
plots.append( + create_line_chart( + "listeners__ribosome_data__actual_elongations", + "Amino Acids Translated", + "AA translated", + ) + ) + + if "listeners__ribosome_data__effective_elongation_rate" in plot_df.columns: + plots.append( + create_line_chart( + "listeners__ribosome_data__effective_elongation_rate", + "Effective Ribosome Elongation Rate", + "Effective elongation rate", + ) + ) + + if not plots: + fallback_df = pl.DataFrame( + { + "message": ["No data available for ribosome usage visualization"], + "x": [0], + "y": [0], + } + ) + fallback_plot = ( + alt.Chart(fallback_df.to_pandas()) + .mark_text(size=20, color="red") + .encode(x="x:Q", y="y:Q", text="message:N") + .properties( + width=600, + height=400, + title="Ribosome Usage Statistics - No Data Available", + ) + ) + plots.append(fallback_plot) + + # Arrange plots in 2 columns as in original + left_plots = plots[::2] # Even indices (0, 2, 4, ...) + right_plots = plots[1::2] # Odd indices (1, 3, 5, ...) + + # Ensure both columns have same length by adding empty chart if needed + if len(left_plots) > len(right_plots): + empty_chart = ( + alt.Chart(pl.DataFrame({"x": [0], "y": [0]}).to_pandas()) + .mark_point(opacity=0) + .encode(x="x:Q", y="y:Q") + .properties(width=600, height=120) + ) + right_plots.append(empty_chart) + elif len(right_plots) > len(left_plots): + empty_chart = ( + alt.Chart(pl.DataFrame({"x": [0], "y": [0]}).to_pandas()) + .mark_point(opacity=0) + .encode(x="x:Q", y="y:Q") + .properties(width=600, height=120) + ) + left_plots.append(empty_chart) + + # Create two column layout + left_column = alt.vconcat(*left_plots) + right_column = alt.vconcat(*right_plots) + combined_plot = ( + alt.hconcat(left_column, right_column) + .resolve_scale(x="shared", y="independent") + .properties(title="Ribosome Usage Statistics") + ) + + output_path = os.path.join(outdir, "ribosome_usage_report.html") + combined_plot.save(output_path) + print(f"Saved visualization to: {output_path}") + + return 
combined_plot diff --git a/ecoli/analysis/multigeneration/rna_decay_03_high.py b/ecoli/analysis/multigeneration/rna_decay_03_high.py index 4a77d0a9e..758ac0f4f 100644 --- a/ecoli/analysis/multigeneration/rna_decay_03_high.py +++ b/ecoli/analysis/multigeneration/rna_decay_03_high.py @@ -1,3 +1,18 @@ +""" +Plot dynamic traces of genes with high expression (> 20 counts of mRNA) + +EG10367_RNA 24.8 gapA Glyceraldehyde 3-phosphate dehydrogenase +EG11036_RNA 25.2 tufA Elongation factor Tu +EG50002_RNA 26.2 rpmA 50S Ribosomal subunit protein L27 +EG10671_RNA 30.1 ompF Outer membrane protein F +EG50003_RNA 38.7 acpP Apo-[acyl carrier protein] +EG10669_RNA 41.1 ompA Outer membrane protein A +EG10873_RNA 44.7 rplL 50S Ribosomal subunit protein L7/L12 dimer +EG12179_RNA 46.2 cspE Transcription antiterminator and regulator of RNA stability +EG10321_RNA 53.2 fliC Flagellin +EG10544_RNA 97.5 lpp Murein lipoprotein +""" + import altair as alt import os from typing import Any @@ -26,7 +41,7 @@ def plot( variant_metadata: dict[str, dict[int, Any]], variant_names: dict[str, str], ): - # Load sim_data for expected cistron order and degradation rates + """Plot dynamic traces of genes with high expression (> 20 counts of mRNA)""" with open_arbitrary_sim_data(sim_data_dict) as f: sim_data = pickle.load(f) cistron_array = sim_data.process.transcription.cistron_data.struct_array @@ -67,7 +82,7 @@ def plot( # Build named_idx structures deg_named = named_idx(deg_field, valid_ids, [deg_indices]) - cnt_named = named_idx(cnt_field, valid_ids, [cnt_indices]) + cnt_named = named_idx(cnt_field, [f"{i}_cnt" for i in valid_ids], [cnt_indices]) # Read stacked columns try: @@ -82,25 +97,27 @@ def plot( # Convert to Polars DataFrame df = pl.DataFrame(data_dict) - # Rename time and convert to minutes + # convert to minutes if "time" in df.columns: df = df.with_columns((pl.col("time") / 60).alias("time_min")) # Melt degradation and counts deg_cols = valid_ids - cnt_cols = valid_ids + cnt_cols = 
[f"{i}_cnt" for i in valid_ids] deg_df = df.select(["time_min"] + deg_cols).melt( "time_min", variable_name="cistron", value_name="degraded" ) - cnt_df = df.select(["time_min"] + cnt_cols).melt( - "time_min", variable_name="cistron", value_name="counts" + cnt_df = ( + df.select(["time_min"] + cnt_cols) + .melt("time_min", variable_name="cistron", value_name="counts") + .with_columns(pl.col("cistron").str.replace("_cnt", "", literal=True)) ) joined = deg_df.join(cnt_df, on=["time_min", "cistron"]) # Smooth and fit per cistron charts = [] window = 100 - for cid in valid_ids[:9]: # up to 9 plots + for cid in valid_ids[:9]: sub = joined.filter(pl.col("cistron") == cid).sort("time_min") if sub.height < 2 * window: continue @@ -129,13 +146,13 @@ def plot( line_x = np.linspace(A.min(), A.max(), 100) line_y = kdeg * line_x - # Scatter with blue points + # Scatter scatter = ( alt.Chart(plot_df) .mark_circle(size=20, opacity=0.6, color="blue") .encode(x="RNA_counts:Q", y="RNA_degraded:Q") ) - # Regression line with light yellow color + # Regression line line = ( alt.Chart(pl.DataFrame({"RNA_counts": line_x, "RNA_degraded": line_y})) .mark_line(color="red", strokeWidth=0.5) @@ -147,7 +164,6 @@ def plot( charts.append((scatter + line).properties(title=title, width=250, height=200)) if charts: - # Arrange charts in 3x3 grid rows = [alt.hconcat(*charts[i : i + 3]) for i in range(0, len(charts), 3)] combined = alt.vconcat(*rows).properties( title="RNA Decay - High Expression Genes" From 930e8afdc6f113d92b525ddffd1bdb4b4e239c8d Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Wed, 2 Jul 2025 07:20:29 +0800 Subject: [PATCH 28/71] Delete wrong-named files --- .../multigeneration/ribosomeProduction.py | 435 --------------- .../analysis/multigeneration/ribosomeUsage.py | 513 ------------------ 2 files changed, 948 deletions(-) delete mode 100644 ecoli/analysis/multigeneration/ribosomeProduction.py delete mode 100644 
ecoli/analysis/multigeneration/ribosomeUsage.py diff --git a/ecoli/analysis/multigeneration/ribosomeProduction.py b/ecoli/analysis/multigeneration/ribosomeProduction.py deleted file mode 100644 index 82b94fb4a..000000000 --- a/ecoli/analysis/multigeneration/ribosomeProduction.py +++ /dev/null @@ -1,435 +0,0 @@ -""" -Record several things: -1. normalised dry mass over time -2. cell, 5S RNA, 16S RNA, and 23S rRNA doubling time (be calculated use the `log(2)` formulation) -3. 5S RNA, 16S RNA, and 23S rRNA initiation probability -4. Ribosome elongation rate -""" - -import altair as alt -import os -from typing import Any -import pickle -import polars as pl -import numpy as np -from duckdb import DuckDBPyConnection -import pandas as pd - -from ecoli.library.parquet_emitter import ( - open_arbitrary_sim_data, -) - -# ----------------------------------------- # - - -def make_get_bulk_counts(sim_data): - """ - Create a function to extract counts of specified bulk molecules using sim_data indices. - - Args: - sim_data: Simulation data object containing molecule IDs and related information. - - Returns: - A function that takes a DataFrame and list of molecule IDs and returns their total counts. - """ - # Get Molecular ID from bulk_molecules - try: - molecule_ids_list = sim_data.internal_state.bulk_molecules.bulk_data[ - "id" - ].tolist() - except AttributeError: - raise ValueError("[ERROR] Check the structure of `sim_data`") - - mol_id_to_index = {mol_id: idx for idx, mol_id in enumerate(molecule_ids_list)} - - def get_bulk_counts(df, molecule_ids): - """ - Extract total counts of specified molecule IDs from the 'bulk' column. - - Args: - df: Polars DataFrame with a 'bulk' column containing Series of counts. - molecule_ids: List of molecule IDs to sum (e.g., s30_16s_rRNA). - - Returns: - Polars Series with total counts for each row. 
- """ - indices = [] - for mol_id in molecule_ids: - if mol_id in mol_id_to_index: - indices.append(mol_id_to_index[mol_id]) - else: - print(f"warning: molecular ID '{mol_id}' is missing") - - return ( - df["bulk"] - .map_elements( - lambda counts_series: ( - sum(counts_series[i] for i in indices if i < len(counts_series)) - if isinstance(counts_series, pl.Series) - else 0 - ), - return_dtype=pl.Float64, - ) - .fill_null(0) - ) - - return get_bulk_counts - - -def get_unique_counts(df, molecule_type): - """Get counts of unique molecules (e.g., active ribosomes) from listeners.""" - col_name = f"listeners__unique_molecule_counts__{molecule_type}" - if col_name in df.columns: - return df[col_name].fill_null(0) - return pl.Series(np.zeros(len(df), dtype=np.int64)) - - -# Calculate the RNA doubling times -def calc_rna_doubling_time(produced_col, count_col, borderline): - production_rate = pl.col(produced_col) / pl.col("time_step_sec") - growth_rate = production_rate / pl.col(count_col) - doubling_time_min = np.log(2) / growth_rate / 60.0 - - # data sanitation - valid_condition = ( - (pl.col(produced_col) >= 0) - & (pl.col(count_col) > 0) - & (growth_rate > 0) - & doubling_time_min.is_finite() - & (doubling_time_min > 0) - & (doubling_time_min < 2 * borderline) - ) - - return pl.when(valid_condition).then(doubling_time_min).otherwise(pl.lit(None)) - - -# ----------------------------------------- # - - -def plot( - params: dict[str, Any], - conn: DuckDBPyConnection, - history_sql: str, - config_sql: str, - success_sql: str, - sim_data_dict: dict[str, dict[int, str]], - validation_data_paths: list[str], - outdir: str, - variant_metadata: dict[str, dict[int, Any]], - variant_names: dict[str, str], -): - """Visualize ribosome production metrics for E. 
coli simulation.""" - # Load sim_data - with open_arbitrary_sim_data(sim_data_dict) as f: - sim_data = pickle.load(f) - - # Get expected doubling time in minutes - sim_doubling_time_min = sim_data.doubling_time.asNumber() - - required_columns = [ - "time", - "variant", - "generation", - "agent_id", - "experiment_id", - "lineage_seed", - "listeners__mass__instantaneous_growth_rate", - "listeners__mass__dry_mass", - "listeners__ribosome_data__rRNA16S_initiated", - "listeners__ribosome_data__rRNA23S_initiated", - "listeners__ribosome_data__rRNA5S_initiated", - "listeners__ribosome_data__rRNA16S_init_prob", - "listeners__ribosome_data__rRNA23S_init_prob", - "listeners__ribosome_data__rRNA5S_init_prob", - "listeners__ribosome_data__total_rna_init", - "listeners__ribosome_data__effective_elongation_rate", - "listeners__unique_molecule_counts__active_ribosome", - "bulk", - ] - - s30_16s_rRNA = list(sim_data.molecule_groups.s30_16s_rRNA) + [ - sim_data.molecule_ids.s30_full_complex - ] - s50_23s_rRNA = list(sim_data.molecule_groups.s50_23s_rRNA) + [ - sim_data.molecule_ids.s50_full_complex - ] - s50_5s_rRNA = list(sim_data.molecule_groups.s50_5s_rRNA) + [ - sim_data.molecule_ids.s50_full_complex - ] - - # Check available columns - available_columns = ( - conn.sql(f"DESCRIBE ({history_sql})").pl()["column_name"].to_list() - ) - data_columns = [col for col in required_columns if col in available_columns] - - print( - f"[INFO] Loading {len(data_columns)} columns for ribosome production analysis" - ) - - df = conn.sql(f""" - SELECT {", ".join(data_columns)} - FROM ({history_sql}) - WHERE agent_id = 0 - ORDER BY variant, generation, time - """).pl() - df = df.rename({"variant": "variant_id", "generation": "generation_index"}) - - # Convert time from seconds to minutes - df = df.with_columns((pl.col("time") / 60).alias("time_min")) - - # Calculate mass doubling time - if "listeners__mass__instantaneous_growth_rate" in df.columns: - df = df.with_columns( - doubling_time_min=( - 
np.log(2) / pl.col("listeners__mass__instantaneous_growth_rate") - ) - / 60 - ) - - # Create get_bulk_counts function with sim_data - get_bulk_counts_func = make_get_bulk_counts(sim_data) - - # Calculate rRNA counts - df = df.with_columns( - [ - get_bulk_counts_func(df, s30_16s_rRNA).alias("bulk_16s_count"), - get_bulk_counts_func(df, s50_23s_rRNA).alias("bulk_23s_count"), - get_bulk_counts_func(df, s50_5s_rRNA).alias("bulk_5s_count"), - get_unique_counts(df, "active_ribosome").alias("ribosome_count"), - ] - ) - - # Total rRNA = bulk rRNA + rRNA in active ribosomes - df = df.with_columns( - [ - (pl.col("bulk_16s_count") + pl.col("ribosome_count")).alias("rrn16s_count"), - (pl.col("bulk_23s_count") + pl.col("ribosome_count")).alias("rrn23s_count"), - (pl.col("bulk_5s_count") + pl.col("ribosome_count")).alias("rrn5s_count"), - ] - ) - - # Calculate time step - df = df.with_columns( - pl.col("time") - .diff() - .over(["variant_id", "generation_index", "agent_id"]) - .alias("time_step_sec") - ) - df = df.with_columns( - time_step_sec=pl.when(pl.col("time_step_sec").is_null()) - .then(pl.col("time")) - .otherwise(pl.col("time_step_sec")) - ) - - if "listeners__ribosome_data__rRNA16S_initiated" in df.columns: - df = df.with_columns( - rrn16S_doubling_time_min=calc_rna_doubling_time( - "listeners__ribosome_data__rRNA16S_initiated", - "rrn16s_count", - sim_doubling_time_min, - ) - ) - if "listeners__ribosome_data__rRNA23S_initiated" in df.columns: - df = df.with_columns( - rrn23S_doubling_time_min=calc_rna_doubling_time( - "listeners__ribosome_data__rRNA23S_initiated", - "rrn23s_count", - sim_doubling_time_min, - ) - ) - if "listeners__ribosome_data__rRNA5S_initiated" in df.columns: - df = df.with_columns( - rrn5S_doubling_time_min=calc_rna_doubling_time( - "listeners__ribosome_data__rRNA5S_initiated", - "rrn5s_count", - sim_doubling_time_min, - ) - ) - - # Calculate initiation probabilities - if "listeners__ribosome_data__rRNA16S_init_prob" in df.columns: - df = 
df.with_columns( - rrn16S_init_prob_normalized=pl.col( - "listeners__ribosome_data__rRNA16S_init_prob" - ) - ) - if "listeners__ribosome_data__rRNA23S_init_prob" in df.columns: - df = df.with_columns( - rrn23S_init_prob_normalized=pl.col( - "listeners__ribosome_data__rRNA23S_init_prob" - ) - ) - if "listeners__ribosome_data__rRNA5S_init_prob" in df.columns: - df = df.with_columns( - rrn5S_init_prob_normalized=pl.col( - "listeners__ribosome_data__rRNA5S_init_prob" - ) - ) - - # Calculate expected initiation probabilities - condition = sim_data.condition - transcription = sim_data.process.transcription - cistron_synth_prob = transcription.cistron_tu_mapping_matrix.dot( - transcription.rna_synth_prob[condition] - ) - - def get_cistron_prob(ids): - indices = [] - for rna_id in ids: - cistron_id = rna_id[:-3] # Remove RNA suffix - idx = np.where(transcription.cistron_data["id"] == cistron_id)[0] - if len(idx) > 0: - indices.append(idx[0]) - return cistron_synth_prob[indices].sum() if indices else 0.0 - - rrn16s_fit_init_prob = get_cistron_prob(sim_data.molecule_groups.s30_16s_rRNA) - rrn23s_fit_init_prob = get_cistron_prob(sim_data.molecule_groups.s50_23s_rRNA) - rrn5s_fit_init_prob = get_cistron_prob(sim_data.molecule_groups.s50_5s_rRNA) - - # Select columns for plotting - plot_columns = ["time_min", "variant_id", "generation_index"] - - # Add other columns - for col in [ - "listeners__mass__dry_mass", - "doubling_time_min", - "rrn16S_doubling_time_min", - "rrn23S_doubling_time_min", - "rrn5S_doubling_time_min", - "rrn16S_init_prob_normalized", - "rrn23S_init_prob_normalized", - "rrn5S_init_prob_normalized", - "listeners__ribosome_data__effective_elongation_rate", - ]: - if col in df.columns: - plot_columns.append(col) - - plot_df = df.select(plot_columns) - - # Calculate initial dry mass at time=0 for each variant and generation - initial_dry_mass = ( - plot_df.filter(pl.col("time_min") == 0) - .select(["variant_id", "listeners__mass__dry_mass"]) - 
.rename({"listeners__mass__dry_mass": "initial_dry_mass"}) - ) - - plot_df = plot_df.join(initial_dry_mass, on=["variant_id"], how="left") - - plot_df = plot_df.with_columns( - (pl.col("listeners__mass__dry_mass") / pl.col("initial_dry_mass")).alias( - "dry_mass_normalized" - ) - ) - - # ----------------------------------------- # - - def create_line_chart(y_field, title, y_title, reference=None): - base = alt.Chart(plot_df.to_pandas()) - line = base.mark_line().encode( - x=alt.X("time_min:Q", title="Time (min)"), - y=alt.Y(f"{y_field}:Q", title=y_title), - color=alt.Color("variant_id:N", legend=alt.Legend(title="Variant")), - ) - chart = line.properties(title=title, width=600, height=120) - if reference is not None: - ref_line = ( - alt.Chart(pd.DataFrame({"y": [reference]})) - .mark_rule(color="red", strokeDash=[5, 5]) - .encode(y="y:Q") - ) - return chart + ref_line - return chart - - # ----------------------------------------- # - plots = [] - - if "dry_mass_normalized" in plot_df.columns: - plots.append( - create_line_chart( - "dry_mass_normalized", - "Normalized Dry Mass Over Time", - "Dry mass (relative to t=0)", - ) - ) - - if "doubling_time_min" in plot_df.columns: - plots.append( - create_line_chart( - "doubling_time_min", - "Cell Doubling Time", - "Doubling Time (min)", - sim_doubling_time_min, - ) - ) - - rna_types = ["16S", "23S", "5S"] - for rna in rna_types: - col_name = f"rrn{rna}_doubling_time_min" - if col_name in plot_df.columns: - plots.append( - create_line_chart( - col_name, - f"{rna} rRNA Doubling Time", - "Doubling Time (min)", - sim_doubling_time_min, - ) - ) - - init_probs = { - "16S": rrn16s_fit_init_prob, - "23S": rrn23s_fit_init_prob, - "5S": rrn5s_fit_init_prob, - } - for rna, ref_prob in init_probs.items(): - col_name = f"rrn{rna}_init_prob_normalized" - if col_name in plot_df.columns: - plots.append( - create_line_chart( - col_name, - f"{rna} rRNA Initiation Probability", - "Probability", - ref_prob, - ) - ) - - if 
"listeners__ribosome_data__effective_elongation_rate" in plot_df.columns: - plots.append( - create_line_chart( - "listeners__ribosome_data__effective_elongation_rate", - "Ribosome Elongation Rate", - "Amino acids/s", - ) - ) - - if not plots: - fallback_df = pl.DataFrame( - { - "message": ["No data available for ribosome production visualization"], - "x": [0], - "y": [0], - } - ) - fallback_plot = ( - alt.Chart(fallback_df.to_pandas()) - .mark_text(size=20, color="red") - .encode(x="x:Q", y="y:Q", text="message:N") - .properties( - width=600, - height=400, - title="Ribosome Production Metrics - No Data Available", - ) - ) - plots.append(fallback_plot) - - combined_plot = ( - alt.vconcat(*plots) - .resolve_scale(x="shared", y="independent") - .properties(title="Ribosome Production Metrics") - ) - - output_path = os.path.join(outdir, "ribosome_production_report.html") - combined_plot.save(output_path) - print(f"Saved visualization to: {output_path}") - - return combined_plot diff --git a/ecoli/analysis/multigeneration/ribosomeUsage.py b/ecoli/analysis/multigeneration/ribosomeUsage.py deleted file mode 100644 index 6b277b248..000000000 --- a/ecoli/analysis/multigeneration/ribosomeUsage.py +++ /dev/null @@ -1,513 +0,0 @@ -""" -Record several things: -1. cell volume over time -2. total / active ribosome count and concentration -3. active ribosome molar / mass fraction -4. Ribosome activation / deactivation count -5. # of AA. be translated -6. the effective ribosome elongation rate -""" - -import altair as alt -import os -from typing import Any -import pickle - -import polars as pl -import numpy as np -from duckdb import DuckDBPyConnection -import pandas as pd - -from ecoli.library.parquet_emitter import ( - open_arbitrary_sim_data, -) - -# ----------------------------------------- # - - -def make_get_bulk_counts(sim_data): - """ - Create a function to extract counts of specified bulk molecules using sim_data indices. 
- - Args: - sim_data: Simulation data object containing molecule IDs and related information. - - Returns: - A function that takes a DataFrame and list of molecule IDs and returns their total counts. - """ - # Get Molecular ID from bulk_molecules - try: - molecule_ids_list = sim_data.internal_state.bulk_molecules.bulk_data[ - "id" - ].tolist() - except AttributeError: - raise ValueError("[ERROR] Check the structure of `sim_data`") - - mol_id_to_index = {mol_id: idx for idx, mol_id in enumerate(molecule_ids_list)} - - def get_bulk_counts(df, molecule_ids): - """ - Extract total counts of specified molecule IDs from the 'bulk' column. - - Args: - df: Polars DataFrame with a 'bulk' column containing Series of counts. - molecule_ids: List of molecule IDs to sum. - - Returns: - Polars Series with total counts for each row. - """ - indices = [] - for mol_id in molecule_ids: - if mol_id in mol_id_to_index: - indices.append(mol_id_to_index[mol_id]) - else: - print(f"warning: molecular ID '{mol_id}' is missing") - - return ( - df["bulk"] - .map_elements( - lambda counts_series: ( - sum(counts_series[i] for i in indices if i < len(counts_series)) - if isinstance(counts_series, pl.Series) - else 0 - ), - return_dtype=pl.Float64, - ) - .fill_null(0) - ) - - return get_bulk_counts - - -def get_unique_counts(df, molecule_type): - """Get counts of unique molecules from listeners.""" - col_name = f"listeners__unique_molecule_counts__{molecule_type}" - if col_name in df.columns: - return df[col_name].fill_null(0) - return pl.Series(np.zeros(len(df), dtype=np.int64)) - - -# ----------------------------------------- # - - -def plot( - params: dict[str, Any], - conn: DuckDBPyConnection, - history_sql: str, - config_sql: str, - success_sql: str, - sim_data_dict: dict[str, dict[int, str]], - validation_data_paths: list[str], - outdir: str, - variant_metadata: dict[str, dict[int, Any]], - variant_names: dict[str, str], -): - """Visualize ribosome usage statistics for E. 
coli simulation.""" - # ----------------------------------------- # - - fg = 1e-15 - fl = 1e-15 - - # ----------------------------------------- # - # Load sim_data - with open_arbitrary_sim_data(sim_data_dict) as f: - sim_data = pickle.load(f) - - required_columns = [ - "time", - "variant", - "generation", - "agent_id", - "experiment_id", - "lineage_seed", - "listeners__mass__instantaneous_growth_rate", - "listeners__mass__cell_mass", - "listeners__mass__volume", - "listeners__ribosome_data__did_initialize", - "listeners__ribosome_data__actual_elongations", - "listeners__ribosome_data__did_terminate", - "listeners__ribosome_data__effective_elongation_rate", - "listeners__unique_molecule_counts__active_ribosome", - "bulk", - ] - - # Get molecular IDs for ribosome subunits - complex_ids_30s = [sim_data.molecule_ids.s30_full_complex] - complex_ids_50s = [sim_data.molecule_ids.s50_full_complex] - - # Get molecular weights - n_avogadro = sim_data.constants.n_avogadro - mw_30s = sim_data.getter.get_masses(complex_ids_30s) - mw_50s = sim_data.getter.get_masses(complex_ids_50s) - mw_70s = mw_30s + mw_50s - - # Check available columns - available_columns = ( - conn.sql(f"DESCRIBE ({history_sql})").pl()["column_name"].to_list() - ) - data_columns = [col for col in required_columns if col in available_columns] - - print(f"[INFO] Loading {len(data_columns)} columns for ribosome usage analysis") - - df = conn.sql(f""" - SELECT {", ".join(data_columns)} - FROM ({history_sql}) - WHERE agent_id = 0 - ORDER BY variant, generation, time - """).pl() - df = df.rename({"variant": "variant_id", "generation": "generation_index"}) - - # Convert time from seconds to minutes - df = df.with_columns((pl.col("time") / 60).alias("time_min")) - - # Create get_bulk_counts function with sim_data - get_bulk_counts_func = make_get_bulk_counts(sim_data) - - # Calculate ribosome subunit counts - df = df.with_columns( - [ - get_bulk_counts_func(df, complex_ids_30s).alias("counts_30s"), - 
get_bulk_counts_func(df, complex_ids_50s).alias("counts_50s"), - get_unique_counts(df, "active_ribosome").alias("active_ribosome_counts"), - ] - ) - - # Calculate total ribosome counts and fractions - df = df.with_columns( - [ - ( - pl.col("active_ribosome_counts") - + pl.min_horizontal(pl.col("counts_30s"), pl.col("counts_50s")) - ).alias("total_ribosome_counts"), - ( - pl.col("active_ribosome_counts").cast(pl.Float64) - / ( - pl.col("active_ribosome_counts") - + pl.min_horizontal(pl.col("counts_30s"), pl.col("counts_50s")) - ) - ).alias("molar_fraction_active"), - ] - ) - - if "listeners__mass__cell_mass" in df.columns: - cell_density = sim_data.constants.cell_density.asNumber() - df = df.with_columns( - (fg * pl.col("listeners__mass__cell_mass") / cell_density).alias( - "cell_volume" - ) - ) - - # Calculate concentrations - df = df.with_columns( - [ - ( - pl.col("total_ribosome_counts") - / n_avogadro.asNumber() - / pl.col("cell_volume") - ).alias("total_ribosome_concentration_mM"), - ( - pl.col("active_ribosome_counts") - / n_avogadro.asNumber() - / pl.col("cell_volume") - ).alias("active_ribosome_concentration_mM"), - ] - ) - - # Calculate masses - mw_30s_value = mw_30s.asNumber() if hasattr(mw_30s, "asNumber") else float(mw_30s) - mw_50s_value = mw_50s.asNumber() if hasattr(mw_50s, "asNumber") else float(mw_50s) - mw_70s_value = mw_70s.asNumber() if hasattr(mw_70s, "asNumber") else float(mw_70s) - - df = df.with_columns( - [ - (pl.col("counts_30s") / n_avogadro.asNumber() * mw_30s_value).alias( - "mass_30s" - ), - (pl.col("counts_50s") / n_avogadro.asNumber() * mw_50s_value).alias( - "mass_50s" - ), - ( - pl.col("active_ribosome_counts") / n_avogadro.asNumber() * mw_70s_value - ).alias("active_ribosome_mass"), - ] - ) - - df = df.with_columns( - [ - ( - pl.col("active_ribosome_mass") + pl.col("mass_30s") + pl.col("mass_50s") - ).alias("total_ribosome_mass"), - ( - pl.col("active_ribosome_mass") - / ( - pl.col("active_ribosome_mass") - + pl.col("mass_30s") - + 
pl.col("mass_50s") - ) - ).alias("mass_fraction_active"), - ] - ) - - if "time" in df.columns: - df = df.with_columns([(pl.col("time") + 1).alias("time_step_sec")]) - - # Calculate rates per time and volume - # if "time_step_sec" in df.columns and "cell_volume" in df.columns: - # df = df.with_columns([ - # (pl.col("listeners__ribosome_data__did_initialize") / - # (pl.col("time_step_sec") * pl.col("cell_volume"))).alias("activations_per_time_volume"), - # (pl.col("listeners__ribosome_data__did_terminate") / - # (pl.col("time_step_sec") * pl.col("cell_volume"))).alias("deactivations_per_time_volume") - # ]) - - if "time_step_sec" in df.columns and "cell_volume" in df.columns: - df = df.with_columns( - [ - ( - pl.col("listeners__ribosome_data__did_initialize") - / (pl.col("cell_volume") / fl) - ).alias("activations_per_volume"), - ( - pl.col("listeners__ribosome_data__did_terminate") - / (pl.col("cell_volume") / fl) - ).alias("deactivations_per_volume"), - ] - ) - - # Select columns for plotting - plot_columns = ["time_min", "variant_id", "generation_index"] - - # Add other columns that exist - for col in [ - "time_step_sec", - "cell_volume", - "total_ribosome_counts", - "total_ribosome_concentration_mM", - "active_ribosome_counts", - "active_ribosome_concentration_mM", - "molar_fraction_active", - "mass_fraction_active", - "listeners__ribosome_data__did_initialize", - "listeners__ribosome_data__did_terminate", - "activations_per_volume", - "deactivations_per_volume", - "listeners__ribosome_data__actual_elongations", - "listeners__ribosome_data__effective_elongation_rate", - ]: - if col in df.columns: - plot_columns.append(col) - - plot_df = df.select(plot_columns) - - # ----------------------------------------- # - - def create_line_chart(y_field, title, y_title, skip_first_point=False): - """Create line chart with optional skipping of first data point.""" - data = plot_df.to_pandas() - if skip_first_point: - # Group by variant and generation, skip first point of 
each group - filtered_data = [] - for (variant_id, generation_index), group in data.groupby( - ["variant_id", "generation_index"] - ): - if len(group) > 1: - filtered_data.append(group.iloc[1:]) - else: - filtered_data.append(group) - data = ( - pd.concat(filtered_data, ignore_index=True) if filtered_data else data - ) - - chart = ( - alt.Chart(data) - .mark_line() - .encode( - x=alt.X("time_min:Q", title="Time (min)"), - y=alt.Y(f"{y_field}:Q", title=y_title), - color=alt.Color("variant_id:N", legend=alt.Legend(title="Variant")), - ) - .properties(title=title, width=600, height=120) - ) - - return chart - - # ----------------------------------------- # - plots = [] - - # Create all 14 plots following the original order - if "time_step_sec" in plot_df.columns: - plots.append( - create_line_chart( - "time_step_sec", "Length of Time Step", "Length of time step (s)" - ) - ) - - if "cell_volume" in plot_df.columns: - plots.append(create_line_chart("cell_volume", "Cell Volume", "Cell volume (L)")) - - if "total_ribosome_counts" in plot_df.columns: - plots.append( - create_line_chart( - "total_ribosome_counts", "Total Ribosome Count", "Total ribosome count" - ) - ) - - if "total_ribosome_concentration_mM" in plot_df.columns: - plots.append( - create_line_chart( - "total_ribosome_concentration_mM", - "Total Ribosome Concentration", - "[Total ribosome] (mM)", - ) - ) - - if "active_ribosome_counts" in plot_df.columns: - plots.append( - create_line_chart( - "active_ribosome_counts", - "Active Ribosome Count", - "Active ribosome count", - skip_first_point=True, - ) - ) - - if "active_ribosome_concentration_mM" in plot_df.columns: - plots.append( - create_line_chart( - "active_ribosome_concentration_mM", - "Active Ribosome Concentration", - "[Active ribosome] (mM)", - skip_first_point=True, - ) - ) - - if "molar_fraction_active" in plot_df.columns: - plots.append( - create_line_chart( - "molar_fraction_active", - "Molar Fraction Active Ribosomes", - "Molar fraction active 
ribosomes", - skip_first_point=True, - ) - ) - - if "mass_fraction_active" in plot_df.columns: - plots.append( - create_line_chart( - "mass_fraction_active", - "Mass Fraction Active Ribosomes", - "Mass fraction active ribosomes", - skip_first_point=True, - ) - ) - - if "listeners__ribosome_data__did_initialize" in plot_df.columns: - plots.append( - create_line_chart( - "listeners__ribosome_data__did_initialize", - "Ribosome Activations", - "Activations per timestep", - ) - ) - - if "listeners__ribosome_data__did_terminate" in plot_df.columns: - plots.append( - create_line_chart( - "listeners__ribosome_data__did_terminate", - "Ribosome Deactivations", - "Deactivations per timestep", - ) - ) - - if "activations_per_volume" in plot_df.columns: - plots.append( - create_line_chart( - "activations_per_volume", - "Activations per Volume (fL)", - "Activations per Volume (fL)", - ) - ) - - if "deactivations_per_volume" in plot_df.columns: - plots.append( - create_line_chart( - "deactivations_per_volume", - "Deactivations per Volume (fL)", - "Deactivations per Volume (fL)", - ) - ) - - if "listeners__ribosome_data__actual_elongations" in plot_df.columns: - plots.append( - create_line_chart( - "listeners__ribosome_data__actual_elongations", - "Amino Acids Translated", - "AA translated", - ) - ) - - if "listeners__ribosome_data__effective_elongation_rate" in plot_df.columns: - plots.append( - create_line_chart( - "listeners__ribosome_data__effective_elongation_rate", - "Effective Ribosome Elongation Rate", - "Effective elongation rate", - ) - ) - - if not plots: - fallback_df = pl.DataFrame( - { - "message": ["No data available for ribosome usage visualization"], - "x": [0], - "y": [0], - } - ) - fallback_plot = ( - alt.Chart(fallback_df.to_pandas()) - .mark_text(size=20, color="red") - .encode(x="x:Q", y="y:Q", text="message:N") - .properties( - width=600, - height=400, - title="Ribosome Usage Statistics - No Data Available", - ) - ) - plots.append(fallback_plot) - - # 
Arrange plots in 2 columns as in original - left_plots = plots[::2] # Even indices (0, 2, 4, ...) - right_plots = plots[1::2] # Odd indices (1, 3, 5, ...) - - # Ensure both columns have same length by adding empty chart if needed - if len(left_plots) > len(right_plots): - empty_chart = ( - alt.Chart(pl.DataFrame({"x": [0], "y": [0]}).to_pandas()) - .mark_point(opacity=0) - .encode(x="x:Q", y="y:Q") - .properties(width=600, height=120) - ) - right_plots.append(empty_chart) - elif len(right_plots) > len(left_plots): - empty_chart = ( - alt.Chart(pl.DataFrame({"x": [0], "y": [0]}).to_pandas()) - .mark_point(opacity=0) - .encode(x="x:Q", y="y:Q") - .properties(width=600, height=120) - ) - left_plots.append(empty_chart) - - # Create two column layout - left_column = alt.vconcat(*left_plots) - right_column = alt.vconcat(*right_plots) - combined_plot = ( - alt.hconcat(left_column, right_column) - .resolve_scale(x="shared", y="independent") - .properties(title="Ribosome Usage Statistics") - ) - - output_path = os.path.join(outdir, "ribosome_usage_report.html") - combined_plot.save(output_path) - print(f"Saved visualization to: {output_path}") - - return combined_plot From 433e632ecee10408465c38e8d41a1c596610cd3c Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Wed, 2 Jul 2025 07:21:52 +0800 Subject: [PATCH 29/71] Add multigeneration analysis into ecoli-glucose-minimal simulation --- runscripts/jenkins/configs/ecoli-glucose-minimal.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runscripts/jenkins/configs/ecoli-glucose-minimal.json b/runscripts/jenkins/configs/ecoli-glucose-minimal.json index a90b7983d..a00e86ffc 100644 --- a/runscripts/jenkins/configs/ecoli-glucose-minimal.json +++ b/runscripts/jenkins/configs/ecoli-glucose-minimal.json @@ -12,7 +12,8 @@ "analysis_options": { "single": {"mass_fraction_summary": {}}, "multiseed": {"protein_counts_validation": {}}, - "multivariant": {"doubling_time_hist": {"skip_n_gens": 
0}, "doubling_time_line": {}} + "multivariant": {"doubling_time_hist": {"skip_n_gens": 0}, "doubling_time_line": {}}, + "multigeneration": {"replication": {}, "ribosome_usage": {}} }, "sherlock": { "container_image": "container-image", From d3bf0db7b3a0b9bf4b5e84be26bb9ef415a51328 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Wed, 2 Jul 2025 07:34:01 +0800 Subject: [PATCH 30/71] Modification for array --- ecoli/analysis/multigeneration/ribosome_production.py | 6 +++--- ecoli/analysis/multigeneration/ribosome_usage.py | 9 +++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ecoli/analysis/multigeneration/ribosome_production.py b/ecoli/analysis/multigeneration/ribosome_production.py index f42e1e4b9..c2c2aeb9a 100644 --- a/ecoli/analysis/multigeneration/ribosome_production.py +++ b/ecoli/analysis/multigeneration/ribosome_production.py @@ -65,9 +65,9 @@ def plot( bulk_ids = sim_data.internal_state.bulk_molecules.bulk_data["id"].tolist() # precompute indices as Python ints - idx_16s = [int(i) for i in bulk_name_to_idx(s30_16s, bulk_ids)] - idx_23s = [int(i) for i in bulk_name_to_idx(s50_23s, bulk_ids)] - idx_5s = [int(i) for i in bulk_name_to_idx(s50_5s, bulk_ids)] + idx_16s = [int(i) for i in np.atleast_1d(bulk_name_to_idx(s30_16s, bulk_ids))] + idx_23s = [int(i) for i in np.atleast_1d(bulk_name_to_idx(s50_23s, bulk_ids))] + idx_5s = [int(i) for i in np.atleast_1d(bulk_name_to_idx(s50_5s, bulk_ids))] required_columns = [ "time", diff --git a/ecoli/analysis/multigeneration/ribosome_usage.py b/ecoli/analysis/multigeneration/ribosome_usage.py index 184090134..bef7193a1 100644 --- a/ecoli/analysis/multigeneration/ribosome_usage.py +++ b/ecoli/analysis/multigeneration/ribosome_usage.py @@ -16,6 +16,7 @@ import polars as pl from duckdb import DuckDBPyConnection import pandas as pd +import numpy as np from ecoli.library.parquet_emitter import ( open_arbitrary_sim_data, @@ -48,8 +49,12 @@ def plot( bulk_ids = 
sim_data.internal_state.bulk_molecules.bulk_data["id"].tolist() # precompute indices as Python ints (following ribosome_production.py pattern) - idx_30s = [int(i) for i in bulk_name_to_idx(complex_ids_30s, bulk_ids)] - idx_50s = [int(i) for i in bulk_name_to_idx(complex_ids_50s, bulk_ids)] + idx_30s = [ + int(i) for i in np.atleast_1d(bulk_name_to_idx(complex_ids_30s, bulk_ids)) + ] + idx_50s = [ + int(i) for i in np.atleast_1d(bulk_name_to_idx(complex_ids_50s, bulk_ids)) + ] # Get molecular weights n_avogadro = sim_data.constants.n_avogadro From 73dd221b176455dab376b6d4d1748dc93b293d30 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Wed, 2 Jul 2025 10:40:33 -0700 Subject: [PATCH 31/71] More polished pip-audit action --- .github/workflows/pip_audit.yml | 322 ++------------------ runscripts/debug/process_vulnerabilities.py | 185 +++++++++++ 2 files changed, 205 insertions(+), 302 deletions(-) create mode 100644 runscripts/debug/process_vulnerabilities.py diff --git a/.github/workflows/pip_audit.yml b/.github/workflows/pip_audit.yml index 4076efbcd..fbfb9df11 100644 --- a/.github/workflows/pip_audit.yml +++ b/.github/workflows/pip_audit.yml @@ -17,7 +17,7 @@ concurrency: cancel-in-progress: true jobs: - build: + audit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -28,317 +28,35 @@ jobs: enable-cache: true version: "0.7.12" - - name: Configure Git - run: | - git config --global user.name 'github-actions[bot]' - git config --global user.email 'github-actions[bot]@users.noreply.github.com' - - name: Audit dependencies and identify vulnerabilities - id: audit run: | # Export requirements for pip-audit to analyze uv export --all-extras --format requirements-txt --no-emit-project > requirements.txt # Run pip-audit but don't fail if vulnerabilities are found - uvx pip-audit -r requirements.txt --disable-pip -v > pip_audit_results.txt || true - - # Check if vulnerabilities were found - if [ ! 
-s pip_audit_results.txt ]; then - echo "has_vulnerabilities=false" >> $GITHUB_OUTPUT - else - echo "has_vulnerabilities=true" >> $GITHUB_OUTPUT - - # Create a detailed mapping of all vulnerabilities for later use - { - # Add a header row for the CSV format - echo "pkg_name,current_ver,vuln_id,fixed_ver" - - # Extract all vulnerabilities with their details - grep -v "^Name\|^------" pip_audit_results.txt | while read -r line; do - if [[ -n "$line" ]]; then - # Extract fields: package name, current version, vulnerability ID, fixed version - pkg_name=$(echo "$line" | awk '{print $1}') - current_ver=$(echo "$line" | awk '{print $2}') - vuln_id=$(echo "$line" | awk '{print $3}') - fixed_ver=$(echo "$line" | awk '{print $NF}') - - # Output as CSV - echo "$pkg_name,$current_ver,$vuln_id,$fixed_ver" - fi - done - } > all_vulnerabilities.csv - - # Store all_vulnerabilities.csv as an artifact - echo "all_vulns_data<> $GITHUB_OUTPUT - cat all_vulnerabilities.csv >> $GITHUB_OUTPUT - echo "EOF" >> $GITHUB_OUTPUT - - # Get unique packages with their highest fixed version - { - echo "Processing unique packages with highest fixed versions:" - - # Use awk to process the CSV and find highest versions - awk -F, 'BEGIN {OFS=","} - # Custom function for semantic version comparison - function version_gt(v1, v2) { - n1 = split(v1, a, "[.-]") - n2 = split(v2, b, "[.-]") - - # Compare each version component - for (i = 1; i <= n1 && i <= n2; i++) { - if (a[i] == b[i]) continue - return (a[i]+0) > (b[i]+0) - } - return n1 > n2 - } - NR == 1 {next} # Skip header - { - pkg = $1 - curr_ver = $2 - vuln = $3 - fix_ver = $4 - - print "Found=" pkg, "current=" curr_ver, "vuln=" vuln, "fix=" fix_ver - - # Check if we have seen this package before - if (!(pkg in highest_ver) || version_gt(fix_ver, highest_ver[pkg])) { - highest_ver[pkg] = fix_ver - print " Updated highest version for", pkg, "to", fix_ver - } - } - END { - # Output unique packages with highest versions - for (pkg in highest_ver) { - 
print pkg "==" highest_ver[pkg] - } - }' all_vulnerabilities.csv - } > unique_packages.txt - - # Store the consolidated package list - consolidated_packages=$(cat unique_packages.txt | grep -v "^Processing\|^Found\|^ Updated" | sort) - echo "vulnerable_packages<> $GITHUB_OUTPUT - echo "$consolidated_packages" >> $GITHUB_OUTPUT - echo "EOF" >> $GITHUB_OUTPUT - fi - - - name: Process vulnerable packages individually - if: steps.audit.outputs.has_vulnerabilities == 'true' - id: process_packages - run: | - # Build the JSON array in a variable first - json_data="[" - first_item=true - - # Store all vulnerability data for reference - all_vulns="${{ steps.audit.outputs.all_vulns_data }}" - - while IFS= read -r line; do - if [[ -n "$line" && $line =~ ([^=]+)==(.+) ]]; then - pkg_name="${BASH_REMATCH[1]}" - pkg_version="${BASH_REMATCH[2]}" - - echo "Processing package: $pkg_name -> $pkg_version" - - # Get current version from the first vulnerability entry - current_ver=$(echo "$all_vulns" | grep -m 1 "^$pkg_name," | cut -d',' -f2) - - # Get all vulnerability IDs for this package - vuln_ids=$(echo "$all_vulns" | grep "^$pkg_name," | cut -d',' -f3 | sort -u | paste -sd "," -) - - # Create signature specific to this package - pkg_signature=$(echo "$pkg_name-$pkg_version" | md5sum | cut -d ' ' -f1) - - echo " Current version: $current_ver" - echo " Vulnerabilities: $vuln_ids" - echo " Signature: $pkg_signature" - - # Add to JSON (with comma if not first) - if [ "$first_item" = "true" ]; then - first_item=false - else - json_data+="," - fi - - # Escape any special characters in the values - pkg_name_esc=$(echo "$pkg_name" | jq -R .) - pkg_version_esc=$(echo "$pkg_version" | jq -R .) - current_ver_esc=$(echo "$current_ver" | jq -R .) - vuln_ids_esc=$(echo "$vuln_ids" | jq -R .) 
- - # Build the JSON object with proper escaping - json_data+="{\"name\":${pkg_name_esc},\"version\":${pkg_version_esc},\"current_version\":${current_ver_esc},\"vuln_id\":${vuln_ids_esc},\"signature\":\"$pkg_signature\"}" - fi - done <<< "${{ steps.audit.outputs.vulnerable_packages }}" - - # Close the JSON array - json_data+="]" - - # Use the multiline delimiter syntax for GitHub Actions outputs - echo "package_data<> $GITHUB_OUTPUT - echo "$json_data" >> $GITHUB_OUTPUT - echo "EOF" >> $GITHUB_OUTPUT - - outputs: - has_vulnerabilities: ${{ steps.audit.outputs.has_vulnerabilities }} - package_data: ${{ steps.process_packages.outputs.package_data }} - all_vulns_data: ${{ steps.audit.outputs.all_vulns_data }} - - update_packages: - needs: build - if: needs.build.outputs.has_vulnerabilities == 'true' - runs-on: ubuntu-latest - strategy: - matrix: - package: ${{ fromJSON(needs.build.outputs.package_data) }} - # Allow other package updates to continue if one fails - fail-fast: false - # Limit concurrent jobs to avoid API rate limits - max-parallel: 5 - - steps: - - uses: actions/checkout@v4 - - - name: Set up uv - uses: astral-sh/setup-uv@v6 - with: - enable-cache: true - version: "0.7.12" - - - name: Check for existing PRs - id: check_prs - run: | - # Check for existing PRs with this package name - pkg_name="${{ matrix.package.name }}" - existing_pr=$(gh pr list --json number,title,body --search "in:title security update for $pkg_name" --jq '.[0]') - - if [[ -n "$existing_pr" ]]; then - pr_number=$(echo "$existing_pr" | jq -r '.number') - echo "Found existing PR #$pr_number for $pkg_name" - - # Check if PR contains an older version of the same package - pr_body=$(echo "$existing_pr" | jq -r '.body') - if echo "$pr_body" | grep -q "Package signature: ${{ matrix.package.signature }}"; then - echo "Found PR with identical package version - skipping" - echo "skip_pr_creation=true" >> $GITHUB_OUTPUT - exit 0 - fi - - # PR exists but for a different version - we'll close it 
and create new one - echo "PR exists for different version - will close and create new PR" - gh pr close $pr_number --comment "Closing in favor of PR with newer version ${pkg_name}==${matrix.package.version}" - fi - - echo "Will create new PR for ${pkg_name}==${{ matrix.package.version }}" - echo "skip_pr_creation=false" >> $GITHUB_OUTPUT - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Configure Git - run: | - git config --global user.name 'github-actions[bot]' - git config --global user.email 'github-actions[bot]@users.noreply.github.com' - - - name: Update package - if: steps.check_prs.outputs.skip_pr_creation == 'false' - id: update - continue-on-error: true # Continue to cleanup step even if this fails - run: | - # Create a unique branch name for this package - branch_name="security-update-${{ matrix.package.name }}-${{ github.run_id }}" - echo "branch_name=$branch_name" >> $GITHUB_OUTPUT - - # Ensure we're on master and it's up-to-date - git fetch origin master - git checkout master - git pull origin master - - # Create new branch for this package only - git checkout -b $branch_name - - echo "Setting up uv environment..." - uv sync --frozen --all-extras - - # Update only this specific package in the lock file - echo "Updating ${{ matrix.package.name }} to ${{ matrix.package.version }}" - uv lock --upgrade-package "${{ matrix.package.name }}==${{ matrix.package.version }}" - - # Verify changes were made - if git diff --quiet uv.lock; then - echo "No changes detected in uv.lock file. This might indicate an issue with the update process." 
- exit 1 - fi - - # Commit changes - git add uv.lock - git commit -m "fix(security): update ${{ matrix.package.name }} to ${{ matrix.package.version }}" - - # Push to the remote branch - git push origin $branch_name + uvx pip-audit -r requirements.txt --disable-pip --desc off --format json > pip_audit_results.txt || true - - name: Create package-specific PR report with all vulnerabilities - if: steps.check_prs.outputs.skip_pr_creation == 'false' && steps.update.outcome == 'success' - id: create_report + - name: Process audit information run: | - # Get all vulnerability details for this package from the CSV - all_vulns="${{ needs.build.outputs.all_vulns_data }}" - - # Create PR description with comprehensive vulnerability information - { - echo "# Security Update: ${{ matrix.package.name }}" - echo "" - echo "This PR updates **${{ matrix.package.name }}** from version ${{ matrix.package.current_version }} to **${{ matrix.package.version }}** to fix the following security vulnerabilities:" - echo "" - - # List all vulnerabilities for this package - echo "## Vulnerability Details" - echo "" - echo "| Vulnerability ID | Affected Version | Fixed Version |" - echo "| --------------- | --------------- | ------------ |" - - # Parse the CSV data to extract vulnerabilities for this package - echo "$all_vulns" | grep -v "^pkg_name" | grep "^${{ matrix.package.name }}," | while IFS=, read -r pkg curr_ver vuln_id fixed_ver; do - # If the vulnerability is fixed by the version we're updating to, include it - echo "| $vuln_id | $curr_ver | $fixed_ver |" - done + # Avoid downloading and installing entire project and all dependencies + uv run --no-sync --isolated --with packaging runscripts/debug/process_vulnerabilities.py pip_audit_results.txt - echo "" - echo "Close and reopen this PR to trigger the CI/CD pipelines before merging." 
- echo "" + - name: Apply package updates + run: | + ./apply_security_upgrades.sh - echo "" - echo "" - } > pr_description.md - - cat pr_description.md - - name: Create Pull Request - if: steps.check_prs.outputs.skip_pr_creation == 'false' && steps.update.outcome == 'success' - id: create_pr - continue-on-error: true - run: | - gh pr create \ - --title "Security update for ${{ matrix.package.name }} to ${{ matrix.package.version }}" \ - --body-file pr_description.md \ - --base master \ - --head ${{ steps.update.outputs.branch_name }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Cleanup on failure - if: | - steps.check_prs.outputs.skip_pr_creation == 'false' && - (steps.update.outcome == 'failure' || steps.create_pr.outcome == 'failure') && - steps.update.outputs.branch_name != '' - run: | - echo "Cleaning up branch due to workflow failure..." - branch_name="${{ steps.update.outputs.branch_name }}" - - # Check if branch exists before attempting to delete - if git ls-remote --heads origin $branch_name | grep -q $branch_name; then - echo "Deleting branch: $branch_name" - git push origin --delete $branch_name - else - echo "Branch $branch_name does not exist or was not created" - fi + uses: peter-evans/create-pull-request@v7 + with: + commit-message: | + fix(security): update package versions + sign-commits: true + title: | + Security updates + body-path: vulnerability_report.md + delete-branch: true + branch: security-updates + base: ${{ github.head_ref }} + add-paths: uv.lock env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/runscripts/debug/process_vulnerabilities.py b/runscripts/debug/process_vulnerabilities.py new file mode 100644 index 000000000..a98b8f453 --- /dev/null +++ b/runscripts/debug/process_vulnerabilities.py @@ -0,0 +1,185 @@ +""" +Process vulnerability data from comma-separated JSON format. 
+ +This script processes JSON data containing package vulnerability information, +generates a markdown report with vulnerability details, and creates a shell +script to apply package upgrades using uv. + +Expected JSON format: +{ + "name": "package_name", + "version": "current_version", + "vulns": [ + { + "id": "VULNERABILITY_ID", + "fix_versions": ["fixed_version"], + "aliases": ["ALIAS1", "ALIAS2"], + "description": "Vulnerability description" + } + ] +} +""" + +import os +import json +import sys +from typing import Any +from datetime import datetime +import argparse +from packaging.version import Version + + +def generate_markdown_report(packages: list[dict[str, Any]]) -> tuple[str, list[str]]: + """Generate a markdown report of vulnerabilities and upgrades.""" + + markdown = f"""# Security Vulnerability Report + +Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} + +## Summary + +Found vulnerabilities in **{len(packages)}** packages requiring updates. + +## Package Upgrades Overview + +| Package | Current Version | Recommended Version | Vulnerabilities | +|---------|----------------|-------------------|-----------------| +""" + + # Package summary table + upgrade_commands = [] + + for pkg in packages: + name = pkg.get("name", "Unknown") + current_version = pkg.get("version", "Unknown") + vulns = pkg.get("vulns", []) + + # Find the highest fix version across all vulnerabilities + all_fix_versions = [] + vuln_count = len(vulns) + + for vuln in vulns: + fix_versions = vuln.get("fix_versions", []) + all_fix_versions.extend([Version(v) for v in fix_versions if v]) + + recommended_version = max(all_fix_versions) if all_fix_versions else "Unknown" + + markdown += f"| **{name}** | {current_version} | **{recommended_version}** | {vuln_count} |\n" + + if recommended_version != "Unknown": + upgrade_commands.append(f'-P "{name}=={recommended_version}"') + + markdown += "\n## Detailed Vulnerability Information\n\n" + + # Detailed vulnerability information + for 
pkg in packages: + name = pkg.get("name", "Unknown") + current_version = pkg.get("version", "Unknown") + vulns = pkg.get("vulns", []) + + markdown += f"### {name} (v{current_version})\n\n" + + if not vulns: + markdown += "No specific vulnerability details available.\n\n" + continue + + markdown += "| Vulnerability ID | Fix Versions | Aliases |\n" + markdown += "|-----------------|-------------|---------|\n" + + for vuln in vulns: + vuln_id = vuln.get("id", "Unknown") + fix_versions = ", ".join(vuln.get("fix_versions", ["Unknown"])) + aliases = ", ".join(vuln.get("aliases", [])) + + markdown += f"| {vuln_id} | {fix_versions} | {aliases} |\n" + + markdown += "\n" + + markdown += """ +## Recommended Actions + +1. Review the vulnerability details above. +2. Close and reopen this PR to trigger CI/CD tests. +3. Approve and merge the PR if everything looks good. + +--- +*This report was generated automatically. Please verify all upgrades before applying.* +""" + + return markdown, upgrade_commands + + +def main(): + parser = argparse.ArgumentParser( + description="Process vulnerability data and generate reports" + ) + parser.add_argument( + "input_file", + nargs="?", + help="Input file with comma-separated JSONs (default: stdin)", + ) + parser.add_argument( + "--output-md", default="vulnerability_report.md", help="Output markdown file" + ) + parser.add_argument( + "--output-sh", + default="apply_security_upgrades.sh", + help="Output shell script file", + ) + + args = parser.parse_args() + + # Read input data + if args.input_file: + try: + with open(args.input_file, "r") as f: + input_data = json.load(f) + except FileNotFoundError: + print(f"Error: File '{args.input_file}' not found.", file=sys.stderr) + sys.exit(1) + else: + print("Reading from stdin... 
(Ctrl+D to end)") + input_data = json.load(sys.stdin) + + if not input_data: + print("Error: No input data provided.", file=sys.stderr) + sys.exit(1) + + # Process the data + packages = [pkg for pkg in input_data["dependencies"] if pkg["vulns"]] + + print(f"📋 Detected {len(packages)} vulnerable packages") + + # Generate markdown report + markdown_content, upgrade_commands = generate_markdown_report(packages) + with open(args.output_md, "w") as f: + f.write(markdown_content) + print(f"📄 Markdown report saved to: {args.output_md}") + + # Generate shell script + script = f"""#!/bin/bash +# Security upgrade script +# Generated automatically from vulnerability analysis + +set -e # Exit on any error + +echo "🔒 Applying security upgrades..." +echo "This script will upgrade vulnerable packages using uv lock --upgrade-package" +uv lock {" ".join(upgrade_commands)} + +echo "✅ All security upgrades completed successfully!" +""" + with open(args.output_sh, "w") as f: + f.write(script) + + # Make script executable + os.chmod(args.output_sh, 0o755) + print(f"🔧 Shell script saved to: {args.output_sh} (executable)") + + print("\n✅ Processing complete!") + print(f"Review the report: {args.output_md}") + print(f"Apply upgrades: ./{args.output_sh}") + + +if __name__ == "__main__": + main() From 5ba125f2af6fd35eb8bb5750c2d269d906fef78e Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Wed, 2 Jul 2025 11:35:31 -0700 Subject: [PATCH 32/71] Run audit on master branch only --- .github/workflows/pip_audit.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/pip_audit.yml b/.github/workflows/pip_audit.yml index fbfb9df11..963e63ba1 100644 --- a/.github/workflows/pip_audit.yml +++ b/.github/workflows/pip_audit.yml @@ -9,8 +9,6 @@ on: - cron: '00 00 * * *' push: branches: [master] - pull_request: - branches: [master] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -56,7 +54,6 @@ jobs: body-path: vulnerability_report.md 
delete-branch: true branch: security-updates - base: ${{ github.head_ref }} add-paths: uv.lock env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 0f3c694200da983205c7de83023aefc0122148f6 Mon Sep 17 00:00:00 2001 From: mpg19 Date: Wed, 2 Jul 2025 12:12:10 -0700 Subject: [PATCH 33/71] Added a new tip to Step 4 to account for troubleshooting issues associated with the venv. --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index a000b1b55..07788b1ad 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,10 @@ uv sync --frozen --extra dev uv run pre-commit install ``` +> **Tip:** If uv is not connecting to the venv correctly, or you are running into an error with the +> `uv run pre-commit install` step, try running `rm -rf .venv` to remove the venv, then run +> `uv sync --frozen --extra dev` followed by `uv run pre-commit install` to reinstall the venv. + 5. Install `nextflow` [following these instructions](https://www.nextflow.io/docs/latest/install.html). If your system has `wget` but not `curl`, replace `curl` in the commands with `wget -qO-`. 
If you choose to install Java with SDKMAN!, after From e4adb2680bcbbe4c56bdf78212eae26df6a5cf7e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 2 Jul 2025 22:21:12 +0000 Subject: [PATCH 34/71] fix(security): update package versions --- uv.lock | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/uv.lock b/uv.lock index 18bbc4489..18abae244 100644 --- a/uv.lock +++ b/uv.lock @@ -1973,21 +1973,19 @@ wheels = [ [[package]] name = "pillow" -version = "11.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/af/cb/bb5c01fcd2a69335b86c22142b2bccfc3464087efb7fd382eee5ffc7fdf7/pillow-11.2.1.tar.gz", hash = "sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6", size = 47026707, upload-time = "2025-04-12T17:50:03.289Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/40/052610b15a1b8961f52537cc8326ca6a881408bc2bdad0d852edeb6ed33b/pillow-11.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:78afba22027b4accef10dbd5eed84425930ba41b3ea0a86fa8d20baaf19d807f", size = 3190185, upload-time = "2025-04-12T17:48:00.417Z" }, - { url = "https://files.pythonhosted.org/packages/e5/7e/b86dbd35a5f938632093dc40d1682874c33dcfe832558fc80ca56bfcb774/pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:78092232a4ab376a35d68c4e6d5e00dfd73454bd12b230420025fbe178ee3b0b", size = 3030306, upload-time = "2025-04-12T17:48:02.391Z" }, - { url = "https://files.pythonhosted.org/packages/a4/5c/467a161f9ed53e5eab51a42923c33051bf8d1a2af4626ac04f5166e58e0c/pillow-11.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25a5f306095c6780c52e6bbb6109624b95c5b18e40aab1c3041da3e9e0cd3e2d", size = 4416121, upload-time = "2025-04-12T17:48:04.554Z" }, - { url = 
"https://files.pythonhosted.org/packages/62/73/972b7742e38ae0e2ac76ab137ca6005dcf877480da0d9d61d93b613065b4/pillow-11.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c7b29dbd4281923a2bfe562acb734cee96bbb129e96e6972d315ed9f232bef4", size = 4501707, upload-time = "2025-04-12T17:48:06.831Z" }, - { url = "https://files.pythonhosted.org/packages/e4/3a/427e4cb0b9e177efbc1a84798ed20498c4f233abde003c06d2650a6d60cb/pillow-11.2.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3e645b020f3209a0181a418bffe7b4a93171eef6c4ef6cc20980b30bebf17b7d", size = 4522921, upload-time = "2025-04-12T17:48:09.229Z" }, - { url = "https://files.pythonhosted.org/packages/fe/7c/d8b1330458e4d2f3f45d9508796d7caf0c0d3764c00c823d10f6f1a3b76d/pillow-11.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b2dbea1012ccb784a65349f57bbc93730b96e85b42e9bf7b01ef40443db720b4", size = 4612523, upload-time = "2025-04-12T17:48:11.631Z" }, - { url = "https://files.pythonhosted.org/packages/b3/2f/65738384e0b1acf451de5a573d8153fe84103772d139e1e0bdf1596be2ea/pillow-11.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:da3104c57bbd72948d75f6a9389e6727d2ab6333c3617f0a89d72d4940aa0443", size = 4587836, upload-time = "2025-04-12T17:48:13.592Z" }, - { url = "https://files.pythonhosted.org/packages/6a/c5/e795c9f2ddf3debb2dedd0df889f2fe4b053308bb59a3cc02a0cd144d641/pillow-11.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:598174aef4589af795f66f9caab87ba4ff860ce08cd5bb447c6fc553ffee603c", size = 4669390, upload-time = "2025-04-12T17:48:15.938Z" }, - { url = "https://files.pythonhosted.org/packages/96/ae/ca0099a3995976a9fce2f423166f7bff9b12244afdc7520f6ed38911539a/pillow-11.2.1-cp312-cp312-win32.whl", hash = "sha256:1d535df14716e7f8776b9e7fee118576d65572b4aad3ed639be9e4fa88a1cad3", size = 2332309, upload-time = "2025-04-12T17:48:17.885Z" }, - { url = 
"https://files.pythonhosted.org/packages/7c/18/24bff2ad716257fc03da964c5e8f05d9790a779a8895d6566e493ccf0189/pillow-11.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:14e33b28bf17c7a38eede290f77db7c664e4eb01f7869e37fa98a5aa95978941", size = 2676768, upload-time = "2025-04-12T17:48:19.655Z" }, - { url = "https://files.pythonhosted.org/packages/da/bb/e8d656c9543276517ee40184aaa39dcb41e683bca121022f9323ae11b39d/pillow-11.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:21e1470ac9e5739ff880c211fc3af01e3ae505859392bf65458c224d0bf283eb", size = 2415087, upload-time = "2025-04-12T17:48:21.991Z" }, +version = "11.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/d0d6dea55cd152ce3d6767bb38a8fc10e33796ba4ba210cbab9354b6d238/pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523", size = 47113069, upload-time = "2025-07-01T09:16:30.666Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/fe/1bc9b3ee13f68487a99ac9529968035cca2f0a51ec36892060edcc51d06a/pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4", size = 5278800, upload-time = "2025-07-01T09:14:17.648Z" }, + { url = "https://files.pythonhosted.org/packages/2c/32/7e2ac19b5713657384cec55f89065fb306b06af008cfd87e572035b27119/pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69", size = 4686296, upload-time = "2025-07-01T09:14:19.828Z" }, + { url = "https://files.pythonhosted.org/packages/41/f1/6f2427a26fc683e00d985bc391bdd76d8dd4e92fac33d841127eb8fb2313/pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7", size = 5977787, upload-time = "2025-07-01T09:14:21.63Z" }, + { url = 
"https://files.pythonhosted.org/packages/e4/c9/06dd4a38974e24f932ff5f98ea3c546ce3f8c995d3f0985f8e5ba48bba19/pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024", size = 6645236, upload-time = "2025-07-01T09:14:23.321Z" }, + { url = "https://files.pythonhosted.org/packages/40/e7/848f69fb79843b3d91241bad658e9c14f39a32f71a301bcd1d139416d1be/pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809", size = 6086950, upload-time = "2025-07-01T09:14:25.237Z" }, + { url = "https://files.pythonhosted.org/packages/0b/1a/7cff92e695a2a29ac1958c2a0fe4c0b2393b60aac13b04a4fe2735cad52d/pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d", size = 6723358, upload-time = "2025-07-01T09:14:27.053Z" }, + { url = "https://files.pythonhosted.org/packages/26/7d/73699ad77895f69edff76b0f332acc3d497f22f5d75e5360f78cbcaff248/pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149", size = 6275079, upload-time = "2025-07-01T09:14:30.104Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ce/e7dfc873bdd9828f3b6e5c2bbb74e47a98ec23cc5c74fc4e54462f0d9204/pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d", size = 6986324, upload-time = "2025-07-01T09:14:31.899Z" }, + { url = "https://files.pythonhosted.org/packages/16/8f/b13447d1bf0b1f7467ce7d86f6e6edf66c0ad7cf44cf5c87a37f9bed9936/pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542", size = 2423067, upload-time = "2025-07-01T09:14:33.709Z" }, ] [[package]] From 2dceefd844e432dc834ce166d28f432247834764 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Thu, 3 
Jul 2025 11:39:22 +0800 Subject: [PATCH 35/71] Delete Redundance --- ecoli/experiments/ecoli_master_sim.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/ecoli/experiments/ecoli_master_sim.py b/ecoli/experiments/ecoli_master_sim.py index cadf0c9b8..058f2b59a 100644 --- a/ecoli/experiments/ecoli_master_sim.py +++ b/ecoli/experiments/ecoli_master_sim.py @@ -23,9 +23,10 @@ import numpy as np from vivarium.core.engine import Engine +from vivarium.core.composer import deep_merge from vivarium.core.process import Process from vivarium.core.serialize import deserialize_value, serialize_value -from vivarium.library.dict_utils import deep_merge, deep_merge_check +from vivarium.library.dict_utils import deep_merge_check from vivarium.library.topology import inverse_topology from vivarium.library.topology import assoc_path from ecoli.library.logging_tools import write_json @@ -116,13 +117,6 @@ def get_git_diff() -> str: If that fails, tries to read the diff from source-info/git-diff.txt file. Raises an error if both methods fail. 
""" - # Try to run git command - # try: - # return ( - # subprocess.check_output(["git", "-C", CONFIG_DIR_PATH, "diff", "HEAD"]) - # .decode("ascii") - # .strip() - # ) try: return ( subprocess.check_output(["git", "-C", CONFIG_DIR_PATH, "diff", "HEAD"]) @@ -359,7 +353,6 @@ def __init__( type=float, action="store", help="Initial time in context of whole lineage.", - default=0.0, ) self.parser.add_argument( "--fail_at_max_duration", From 41c8549b387bb0887d3cdc125235b020479e7a1f Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Thu, 3 Jul 2025 11:49:19 +0800 Subject: [PATCH 36/71] Delete to.panda() --- ecoli/analysis/multigeneration/ribosome_production.py | 2 +- ecoli/analysis/multigeneration/ribosome_usage.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ecoli/analysis/multigeneration/ribosome_production.py b/ecoli/analysis/multigeneration/ribosome_production.py index c2c2aeb9a..28b7c55fb 100644 --- a/ecoli/analysis/multigeneration/ribosome_production.py +++ b/ecoli/analysis/multigeneration/ribosome_production.py @@ -372,7 +372,7 @@ def create_histogram( if not plots: fallback = pl.DataFrame({"message": ["No data available"], "x": [0], "y": [0]}) plots.append( - alt.Chart(fallback.to_pandas()) + alt.Chart(fallback) .mark_text(size=20, color="red") .encode(x="x:Q", y="y:Q", text="message:N") .properties(width=600, height=400, title="No Data") diff --git a/ecoli/analysis/multigeneration/ribosome_usage.py b/ecoli/analysis/multigeneration/ribosome_usage.py index bef7193a1..c14e766ae 100644 --- a/ecoli/analysis/multigeneration/ribosome_usage.py +++ b/ecoli/analysis/multigeneration/ribosome_usage.py @@ -245,7 +245,7 @@ def plot( def create_line_chart(y_field, title, y_title, skip_first_point=False): """Create line chart with optional skipping of first data point.""" - data = plot_df.to_pandas() + data = plot_df if skip_first_point: # Group by variant and generation, skip first point of each group filtered_data = [] 
@@ -404,7 +404,7 @@ def create_line_chart(y_field, title, y_title, skip_first_point=False): } ) fallback_plot = ( - alt.Chart(fallback_df.to_pandas()) + alt.Chart(fallback_df) .mark_text(size=20, color="red") .encode(x="x:Q", y="y:Q", text="message:N") .properties( @@ -422,7 +422,7 @@ def create_line_chart(y_field, title, y_title, skip_first_point=False): # Ensure both columns have same length by adding empty chart if needed if len(left_plots) > len(right_plots): empty_chart = ( - alt.Chart(pl.DataFrame({"x": [0], "y": [0]}).to_pandas()) + alt.Chart(pl.DataFrame({"x": [0], "y": [0]})) .mark_point(opacity=0) .encode(x="x:Q", y="y:Q") .properties(width=600, height=120) @@ -430,7 +430,7 @@ def create_line_chart(y_field, title, y_title, skip_first_point=False): right_plots.append(empty_chart) elif len(right_plots) > len(left_plots): empty_chart = ( - alt.Chart(pl.DataFrame({"x": [0], "y": [0]}).to_pandas()) + alt.Chart(pl.DataFrame({"x": [0], "y": [0]})) .mark_point(opacity=0) .encode(x="x:Q", y="y:Q") .properties(width=600, height=120) From 9b145b6b1a1ea6e264c7633afd2a6241383a924a Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Thu, 3 Jul 2025 12:50:28 +0800 Subject: [PATCH 37/71] Remove calling Bulk in ribosome_production.py --- .../multigeneration/ribosome_production.py | 66 +++++++------------ 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/ecoli/analysis/multigeneration/ribosome_production.py b/ecoli/analysis/multigeneration/ribosome_production.py index 28b7c55fb..93e2b57df 100644 --- a/ecoli/analysis/multigeneration/ribosome_production.py +++ b/ecoli/analysis/multigeneration/ribosome_production.py @@ -22,7 +22,7 @@ def calc_rna_doubling_time( """ production_rate = pl.col(produced_col) / pl.col("time_step_sec") growth_rate = production_rate / pl.col(count_col) - dt_min = np.log(2) / growth_rate / 60 + dt_min = float(np.log(2)) / growth_rate / 60 valid = ( (pl.col(produced_col) >= 0) & (pl.col(count_col) > 0) @@ -84,36 
+84,30 @@ def plot( "listeners__ribosome_data__rRNA5S_init_prob", "listeners__ribosome_data__effective_elongation_rate", "listeners__unique_molecule_counts__active_ribosome", - "bulk", ] # load data - # Extract each bulk index into its own column using named_idx(), then sum per rRNA species - idx_groups = {"bulk_16s": idx_16s, "bulk_23s": idx_23s, "bulk_5s": idx_5s} - projections = ( - required_columns - + [ - named_idx(col="bulk", names=[f"{grp}_{i}"], idx=[[i]], zero_to_null=True) - for grp, idxs in idx_groups.items() - for i in idxs - ] - + [ - f"({' + '.join(f'{grp}_{i}' for i in idxs)}) AS {grp}_count" - for grp, idxs in idx_groups.items() - ] - ) + # Create the bulk index expressions + bulk_16s_expr = named_idx("bulk", [f"bulk_{i}" for i in idx_16s], [idx_16s]) + bulk_23s_expr = named_idx("bulk", [f"bulk_{i}" for i in idx_23s], [idx_23s]) + bulk_5s_expr = named_idx("bulk", [f"bulk_{i}" for i in idx_5s], [idx_5s]) + + # Combine all columns and expressions + all_columns = ", ".join(required_columns) + bulk_expressions = ", ".join([bulk_16s_expr, bulk_23s_expr, bulk_5s_expr]) + # Build the SQL query sql = f""" - SELECT {", ".join(projections)} + SELECT {all_columns}, {bulk_expressions} FROM ({history_sql}) WHERE agent_id = 0 ORDER BY generation, time """ + df = conn.sql(sql).pl() # time df = df.with_columns((pl.col("time") / 60).alias("time_min")) - df = df.with_columns( pl.col("time") .diff() @@ -128,7 +122,9 @@ def plot( # cell doubling time if "listeners__mass__instantaneous_growth_rate" in df.columns: - val = np.log(2) / pl.col("listeners__mass__instantaneous_growth_rate") / 60 + val = ( + float(np.log(2)) / pl.col("listeners__mass__instantaneous_growth_rate") / 60 + ) df = df.with_columns( pl.when(val.is_between(0, 2 * sim_doubling_time, closed="both")) .then(val) @@ -138,29 +134,15 @@ def plot( df = df.with_columns( [ - # compute bulk rRNA counts - pl.col("bulk") - .map_elements( - lambda arr: sum(arr[i] for i in idx_16s if i < len(arr)), - 
return_dtype=pl.Float64, - ) - .fill_null(0) - .alias("bulk_16s_count"), - pl.col("bulk") - .map_elements( - lambda arr: sum(arr[i] for i in idx_23s if i < len(arr)), - return_dtype=pl.Float64, - ) - .fill_null(0) - .alias("bulk_23s_count"), - pl.col("bulk") - .map_elements( - lambda arr: sum(arr[i] for i in idx_5s if i < len(arr)), - return_dtype=pl.Float64, - ) - .fill_null(0) - .alias("bulk_5s_count"), - # compute unique ribosomes + pl.sum_horizontal([pl.col(f"bulk_{i}") for i in idx_16s]).alias( + "bulk_16s_count" + ), + pl.sum_horizontal([pl.col(f"bulk_{i}") for i in idx_23s]).alias( + "bulk_23s_count" + ), + pl.sum_horizontal([pl.col(f"bulk_{i}") for i in idx_5s]).alias( + "bulk_5s_count" + ), pl.col("listeners__unique_molecule_counts__active_ribosome") .fill_null(0) .alias("ribosome_count"), From 1b6973a9a3f98b853b9341fc88ababdfbc92d0cc Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Thu, 3 Jul 2025 13:08:26 +0800 Subject: [PATCH 38/71] Remove calling Bulk in ribosome_usage.py --- .../multigeneration/ribosome_usage.py | 51 ++++++++----------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/ecoli/analysis/multigeneration/ribosome_usage.py b/ecoli/analysis/multigeneration/ribosome_usage.py index c14e766ae..b49ae5249 100644 --- a/ecoli/analysis/multigeneration/ribosome_usage.py +++ b/ecoli/analysis/multigeneration/ribosome_usage.py @@ -18,9 +18,7 @@ import pandas as pd import numpy as np -from ecoli.library.parquet_emitter import ( - open_arbitrary_sim_data, -) +from ecoli.library.parquet_emitter import open_arbitrary_sim_data, named_idx from ecoli.library.schema import bulk_name_to_idx # ----------------------------------------- # @@ -77,23 +75,24 @@ def plot( "listeners__ribosome_data__did_terminate", "listeners__ribosome_data__effective_elongation_rate", "listeners__unique_molecule_counts__active_ribosome", - "bulk", ] - # Check available columns - available_columns = ( - conn.sql(f"DESCRIBE 
({history_sql})").pl()["column_name"].to_list() - ) - data_columns = [col for col in required_columns if col in available_columns] + # Create the bulk index expressions + expr_30s = named_idx("bulk", [f"bulk_30s_{i}" for i in idx_30s], [idx_30s]) + expr_50s = named_idx("bulk", [f"bulk_50s_{i}" for i in idx_50s], [idx_50s]) - print(f"[INFO] Loading {len(data_columns)} columns for ribosome usage analysis") + # load data + sql = f""" + SELECT + {", ".join(required_columns)}, + {expr_30s}, + {expr_50s} + FROM ({history_sql}) + WHERE agent_id = 0 + ORDER BY generation, time + """ - df = conn.sql(f""" - SELECT {", ".join(data_columns)} - FROM ({history_sql}) - WHERE agent_id = 0 - ORDER BY variant, generation, time - """).pl() + df = conn.sql(sql).pl() # Convert time if "time" in df.columns: @@ -101,23 +100,13 @@ def plot( df = df.with_columns([(pl.col("time") + 1).alias("time_step_sec")]) # Calculate ribosome subunit counts + cols_30s = [c for c in df.columns if c.startswith("bulk_30s_")] + cols_50s = [c for c in df.columns if c.startswith("bulk_50s_")] df = df.with_columns( [ # compute bulk ribosome subunit counts - pl.col("bulk") - .map_elements( - lambda arr: sum(arr[i] for i in idx_30s if i < len(arr)), - return_dtype=pl.Float64, - ) - .fill_null(0) - .alias("counts_30s"), - pl.col("bulk") - .map_elements( - lambda arr: sum(arr[i] for i in idx_50s if i < len(arr)), - return_dtype=pl.Float64, - ) - .fill_null(0) - .alias("counts_50s"), + pl.sum_horizontal(cols_30s).alias("counts_30s"), + pl.sum_horizontal(cols_50s).alias("counts_50s"), # compute unique ribosomes pl.col("listeners__unique_molecule_counts__active_ribosome") .fill_null(0) @@ -245,7 +234,7 @@ def plot( def create_line_chart(y_field, title, y_title, skip_first_point=False): """Create line chart with optional skipping of first data point.""" - data = plot_df + data = plot_df.to_pandas() if skip_first_point: # Group by variant and generation, skip first point of each group filtered_data = [] From 
2747599281855f2989bb6467e839f48fef709618 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Thu, 3 Jul 2025 13:40:20 +0800 Subject: [PATCH 39/71] Modification for 2 passes through the data --- .../multigeneration/ribosome_crowding.py | 250 +++++++----------- 1 file changed, 89 insertions(+), 161 deletions(-) diff --git a/ecoli/analysis/multigeneration/ribosome_crowding.py b/ecoli/analysis/multigeneration/ribosome_crowding.py index dc98402cb..08ffeb71a 100644 --- a/ecoli/analysis/multigeneration/ribosome_crowding.py +++ b/ecoli/analysis/multigeneration/ribosome_crowding.py @@ -5,24 +5,20 @@ import altair as alt import os from typing import Any - -from duckdb import DuckDBPyConnection import pickle import polars as pl -import numpy as np +from duckdb import DuckDBPyConnection from ecoli.library.parquet_emitter import ( field_metadata, open_arbitrary_sim_data, named_idx, - read_stacked_columns, ) -# ----------------------------------------- # - -# Set this to ensure maximum figure size is not exceeded MAX_NUMBER_OF_MONOMERS_TO_PLOT = 300 +# ----------------------------------------- # + def plot( params: dict[str, Any], @@ -37,192 +33,124 @@ def plot( variant_names: dict[str, str], ): """ - Comparison of target translation probabilities vs actual translation - probabilities for mRNAs whose translation probabilities exceeded the limit - set by the physical size and the elongation rates of ribosomes. + Comparison of target vs actual translation probabilities for overcrowded mRNAs. 
""" - # Load sim_data to get monomer information + # Load sim_data for monomer mappings with open_arbitrary_sim_data(sim_data_dict) as f: sim_data = pickle.load(f) - # Get monomer IDs and mappings - mRNA_sim_data = sim_data.process.transcription.cistron_data.struct_array - monomer_sim_data = sim_data.process.translation.monomer_data.struct_array - monomer_ids = monomer_sim_data["id"].tolist() - - # Build mappings: monomer_id -> mRNA_id -> gene_id - monomer_to_mRNA_id_dict = dict( - zip(monomer_sim_data["id"], monomer_sim_data["cistron_id"]) - ) - mRNA_to_gene_id_dict = dict(zip(mRNA_sim_data["id"], mRNA_sim_data["gene_id"])) + # Get monomer and gene mappings + mRNA_data = sim_data.process.transcription.cistron_data.struct_array + monomer_data = sim_data.process.translation.monomer_data.struct_array + + monomer_to_gene = {} + for mono_id, cistron_id in zip(monomer_data["id"], monomer_data["cistron_id"]): + gene_id = next( + ( + g + for c, g in zip(mRNA_data["id"], mRNA_data["gene_id"]) + if c == cistron_id + ), + "Unknown", + ) + monomer_to_gene[mono_id] = gene_id - # Get field metadata for ribosome data + # Get field metadata try: - target_field_names = field_metadata( + field_names = field_metadata( conn, config_sql, "listeners__ribosome_data__target_prob_translation_per_transcript", ) - actual_field_names = field_metadata( - conn, - config_sql, - "listeners__ribosome_data__actual_prob_translation_per_transcript", - ) except Exception as e: - print(f"Error getting field metadata: {e}") - print("Trying alternative listener names...") - try: - target_field_names = field_metadata( - conn, config_sql, "listeners__ribosome_data" - ) - actual_field_names = target_field_names # Assume same structure - except Exception as e2: - print(f"Alternative approach also failed: {e2}") - return - - # Find indices for each monomer in the field metadata - target_monomer_indices = [] - actual_monomer_indices = [] - valid_monomer_ids = [] - - for i, monomer_id in 
enumerate(monomer_ids): - if monomer_id in target_field_names and monomer_id in actual_field_names: - target_idx = target_field_names.index(monomer_id) - actual_idx = actual_field_names.index(monomer_id) - target_monomer_indices.append(target_idx) - actual_monomer_indices.append(actual_idx) - valid_monomer_ids.append(monomer_id) - - if not valid_monomer_ids: - print("No valid monomer IDs found in ribosome data fields.") + print(f"[ERROR] Error getting field metadata: {e}") return - print(f"[INFO] Found {len(valid_monomer_ids)} valid monomer IDs") - - # Create named indices for data reading - target_named = named_idx( - "listeners__ribosome_data__target_prob_translation_per_transcript", - valid_monomer_ids, - [target_monomer_indices], - ) - actual_named = named_idx( - "listeners__ribosome_data__actual_prob_translation_per_transcript", - valid_monomer_ids, - [actual_monomer_indices], + # First pass: Find overcrowded monomer indices + overcrowded_query = f""" + WITH unnested AS ( + SELECT + unnest(listeners__ribosome_data__actual_prob_translation_per_transcript) as actual, + unnest(listeners__ribosome_data__target_prob_translation_per_transcript) as target, + generate_subscripts(listeners__ribosome_data__target_prob_translation_per_transcript, 1) as idx + FROM ({history_sql}) ) + SELECT DISTINCT idx + FROM unnested + WHERE target > actual + ORDER BY idx + LIMIT {MAX_NUMBER_OF_MONOMERS_TO_PLOT} + """ - # Read target and actual data separately to ensure proper structure - try: - # Read target data - target_data = read_stacked_columns( - history_sql, - [target_named], - conn=conn, - ) - target_df = pl.DataFrame(target_data).with_columns( - **{"Time (min)": pl.col("time") / 60} - ) - - # Read actual data - actual_data = read_stacked_columns( - history_sql, - [actual_named], - conn=conn, - ) - actual_df = pl.DataFrame(actual_data).with_columns( - **{"Time (min)": pl.col("time") / 60} - ) - - # Get the probability columns - target_prob_cols = [ - col for col in 
target_df.columns if col in valid_monomer_ids - ] - actual_prob_cols = [ - col for col in actual_df.columns if col in valid_monomer_ids - ] - - if not target_prob_cols or not actual_prob_cols: - print("Could not find probability columns in datasets") - return - - # Create arrays for calculation - target_prob_array = target_df.select(target_prob_cols).to_numpy() - actual_prob_array = actual_df.select(actual_prob_cols).to_numpy() - time_min = target_df["Time (min)"].to_numpy() + overcrowded_indices = [ + row[0] - 1 for row in conn.execute(overcrowded_query).fetchall() + ] # Convert to 0-based - print("[INFO] Successfully read target and actual data") - print( - f"[INFO] Target shape: {target_prob_array.shape}, Actual shape: {actual_prob_array.shape}" - ) - - except Exception as e: - print(f"Failed to read separate datasets: {e}") + if not overcrowded_indices: + print("[INFO] No overcrowded monomers found.") return - # Calculate differences to find overcrowded mRNAs - prob_differences = target_prob_array - actual_prob_array - overcrowded_monomer_indexes = np.where(prob_differences.max(axis=0) > 0)[0] - n_overcrowded_monomers = len(overcrowded_monomer_indexes) + print(f"[INFO] Found {len(overcrowded_indices)} overcrowded monomers") - print(f"[INFO] Found {n_overcrowded_monomers} overcrowded monomers") + # Second pass: Get data for overcrowded monomers only + actual_expr = named_idx( + "listeners__ribosome_data__actual_prob_translation_per_transcript", + [f"actual_{i}" for i in range(len(overcrowded_indices))], + [overcrowded_indices], + ) - if n_overcrowded_monomers == 0: - print("No overcrowded mRNAs found in the simulation.") - return + target_expr = named_idx( + "listeners__ribosome_data__target_prob_translation_per_transcript", + [f"target_{i}" for i in range(len(overcrowded_indices))], + [overcrowded_indices], + ) - # Get gene IDs for overcrowded monomers - overcrowded_monomer_ids = [ - valid_monomer_ids[i] for i in overcrowded_monomer_indexes - ] - 
overcrowded_gene_ids = [ - mRNA_to_gene_id_dict.get(monomer_to_mRNA_id_dict.get(monomer_id), "unknown") - for monomer_id in overcrowded_monomer_ids - ] + data_query = f"SELECT {actual_expr}, {target_expr}, time FROM ({history_sql})" + df = conn.execute(data_query).fetchdf() + # ----------------------------------------- # + # Prepare plot data following original format + plot_data = [] + n_overcrowded_monomers = len(overcrowded_indices) n_overcrowded_monomers_to_plot = min( n_overcrowded_monomers, MAX_NUMBER_OF_MONOMERS_TO_PLOT ) - # ----------------------------------------- # - - plot_data = [] - for i, monomer_index in enumerate(overcrowded_monomer_indexes): + for i, idx in enumerate(overcrowded_indices): if i >= MAX_NUMBER_OF_MONOMERS_TO_PLOT: break - gene_id = overcrowded_gene_ids[i] - - # Get the data for this monomer - target_probs = target_prob_array[:, monomer_index] - actual_probs = actual_prob_array[:, monomer_index] - - # Add target probabilities - for j, time_val in enumerate(time_min): - plot_data.append( - { - "Time_min": float(time_val), - "Gene_ID": str(gene_id), - "Probability_Type": "target", - "Translation_Probability": float(target_probs[j]), - "Plot_Order": i, - } - ) - - # Add actual probabilities - for j, time_val in enumerate(time_min): - plot_data.append( - { - "Time_min": float(time_val), - "Gene_ID": str(gene_id), - "Probability_Type": "actual", - "Translation_Probability": float(actual_probs[j]), - "Plot_Order": i, - } - ) + if idx < len(field_names): + monomer_id = field_names[idx] + gene_id = monomer_to_gene.get(monomer_id, "Unknown") + + # Add target probabilities + for _, row in df.iterrows(): + plot_data.append( + { + "Time_min": float(row["time"]), + "Gene_ID": str(gene_id), + "Probability_Type": "target", + "Translation_Probability": float(row[f"target_{i}"]), + "Plot_Order": i, + } + ) + + # Add actual probabilities + for _, row in df.iterrows(): + plot_data.append( + { + "Time_min": float(row["time"]), + "Gene_ID": str(gene_id), 
+ "Probability_Type": "actual", + "Translation_Probability": float(row[f"actual_{i}"]), + "Plot_Order": i, + } + ) if not plot_data: - print("No data prepared for plotting") + print("[INFO] No data prepared for plotting") return plot_df = pl.DataFrame(plot_data) From bbd540c5a1b11b481f2a5b3e391905f7f9e00733 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Fri, 4 Jul 2025 01:22:45 +0800 Subject: [PATCH 40/71] Use unpivot to simplify the loop in ribosome_crowding.py --- .../multigeneration/ribosome_crowding.py | 114 ++++++++++-------- 1 file changed, 67 insertions(+), 47 deletions(-) diff --git a/ecoli/analysis/multigeneration/ribosome_crowding.py b/ecoli/analysis/multigeneration/ribosome_crowding.py index 08ffeb71a..e65e7aa8b 100644 --- a/ecoli/analysis/multigeneration/ribosome_crowding.py +++ b/ecoli/analysis/multigeneration/ribosome_crowding.py @@ -91,19 +91,36 @@ def plot( print("[INFO] No overcrowded monomers found.") return - print(f"[INFO] Found {len(overcrowded_indices)} overcrowded monomers") + n_overcrowded_monomers = len(overcrowded_indices) + n_overcrowded_monomers_to_plot = min( + n_overcrowded_monomers, MAX_NUMBER_OF_MONOMERS_TO_PLOT + ) + + print(f"[INFO] Found {n_overcrowded_monomers} overcrowded monomers") # Second pass: Get data for overcrowded monomers only + actual_columns = [] + target_columns = [] + + for i, idx in enumerate(overcrowded_indices): + if i >= n_overcrowded_monomers_to_plot: + break + if idx < len(field_names): + monomer_id = field_names[idx] + gene_id = monomer_to_gene.get(monomer_id, "Unknown") + actual_columns.append(f"actual__{gene_id}") + target_columns.append(f"target__{gene_id}") + actual_expr = named_idx( "listeners__ribosome_data__actual_prob_translation_per_transcript", - [f"actual_{i}" for i in range(len(overcrowded_indices))], - [overcrowded_indices], + actual_columns, + [overcrowded_indices[: len(actual_columns)]], ) target_expr = named_idx( 
"listeners__ribosome_data__target_prob_translation_per_transcript", - [f"target_{i}" for i in range(len(overcrowded_indices))], - [overcrowded_indices], + target_columns, + [overcrowded_indices[: len(target_columns)]], ) data_query = f"SELECT {actual_expr}, {target_expr}, time FROM ({history_sql})" @@ -111,50 +128,53 @@ def plot( # ----------------------------------------- # # Prepare plot data following original format - plot_data = [] - n_overcrowded_monomers = len(overcrowded_indices) - n_overcrowded_monomers_to_plot = min( - n_overcrowded_monomers, MAX_NUMBER_OF_MONOMERS_TO_PLOT - ) - - for i, idx in enumerate(overcrowded_indices): - if i >= MAX_NUMBER_OF_MONOMERS_TO_PLOT: - break - - if idx < len(field_names): - monomer_id = field_names[idx] - gene_id = monomer_to_gene.get(monomer_id, "Unknown") - - # Add target probabilities - for _, row in df.iterrows(): - plot_data.append( - { - "Time_min": float(row["time"]), - "Gene_ID": str(gene_id), - "Probability_Type": "target", - "Translation_Probability": float(row[f"target_{i}"]), - "Plot_Order": i, - } - ) - - # Add actual probabilities - for _, row in df.iterrows(): - plot_data.append( - { - "Time_min": float(row["time"]), - "Gene_ID": str(gene_id), - "Probability_Type": "actual", - "Translation_Probability": float(row[f"actual_{i}"]), - "Plot_Order": i, - } + pl_df = pl.DataFrame(df) + + # Get all probability columns (both actual and target) + prob_columns = actual_columns + target_columns + + # Unpivot the data + plot_df = ( + pl_df.unpivot( + index=["time"], + on=prob_columns, + variable_name="variable", + value_name="Translation_Probability", + ) + .with_columns( + [ + # Split variable name into probability type and gene ID + pl.col("variable") + .str.split_exact("__", 1) + .struct.rename_fields(["Probability_Type", "Gene_ID"]), + (pl.col("time") / 60).alias("Time_min"), + ] + ) + .unnest("variable") + .with_columns( + [ + # Add plot order for consistent ordering + pl.col("Gene_ID") + .map_elements( + lambda 
x: next( + ( + i + for i, idx in enumerate( + overcrowded_indices[: len(actual_columns)] + ) + if idx < len(field_names) + and monomer_to_gene.get(field_names[idx], "Unknown") == x + ), + 0, + ), + return_dtype=pl.Int32, ) + .alias("Plot_Order") + ] + ) + ) - if not plot_data: - print("[INFO] No data prepared for plotting") - return - - plot_df = pl.DataFrame(plot_data) - + # ----------------------------------------- # # Create individual plots for each overcrowded gene charts = [] for i in range(n_overcrowded_monomers_to_plot): From d89335270725d9eaa999a064579ab348681b7c1e Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Fri, 4 Jul 2025 01:44:45 +0800 Subject: [PATCH 41/71] Delete redundant column --- .../multigeneration/ribosome_crowding.py | 28 ++++--------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/ecoli/analysis/multigeneration/ribosome_crowding.py b/ecoli/analysis/multigeneration/ribosome_crowding.py index e65e7aa8b..7fced9d75 100644 --- a/ecoli/analysis/multigeneration/ribosome_crowding.py +++ b/ecoli/analysis/multigeneration/ribosome_crowding.py @@ -151,34 +151,16 @@ def plot( ] ) .unnest("variable") - .with_columns( - [ - # Add plot order for consistent ordering - pl.col("Gene_ID") - .map_elements( - lambda x: next( - ( - i - for i, idx in enumerate( - overcrowded_indices[: len(actual_columns)] - ) - if idx < len(field_names) - and monomer_to_gene.get(field_names[idx], "Unknown") == x - ), - 0, - ), - return_dtype=pl.Int32, - ) - .alias("Plot_Order") - ] - ) ) + # Get unique gene IDs in the order they appear in the data + unique_genes = plot_df["Gene_ID"].unique().to_list() + # ----------------------------------------- # # Create individual plots for each overcrowded gene charts = [] - for i in range(n_overcrowded_monomers_to_plot): - gene_data = plot_df.filter(pl.col("Plot_Order") == i) + for i, gene_id in enumerate(unique_genes[:n_overcrowded_monomers_to_plot]): + gene_data = 
plot_df.filter(pl.col("Gene_ID") == gene_id) if gene_data.height == 0: continue From 1d43de9ab3a6f22f564974836f89cae5add1903d Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Fri, 4 Jul 2025 03:07:31 +0800 Subject: [PATCH 42/71] Modification for plot and explanation --- ecoli/analysis/multigeneration/ribosome_crowding.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/ecoli/analysis/multigeneration/ribosome_crowding.py b/ecoli/analysis/multigeneration/ribosome_crowding.py index 7fced9d75..97b54c2a8 100644 --- a/ecoli/analysis/multigeneration/ribosome_crowding.py +++ b/ecoli/analysis/multigeneration/ribosome_crowding.py @@ -68,6 +68,7 @@ def plot( return # First pass: Find overcrowded monomer indices + # If gene X's target > actual at any timepoint t, it'll be marked as overcrowded. overcrowded_query = f""" WITH unnested AS ( SELECT @@ -181,17 +182,12 @@ def plot( color=alt.Color( "Probability_Type:N", scale=alt.Scale( - domain=["target", "actual"], range=["#1f77b4", "#ff7f0e"] + # actually, the blue target line will cover the orange actual line if they are the same + domain=["target", "actual"], + range=["#1f77b4", "#ff7f0e"], ), legend=alt.Legend(title="Type") if i == 0 else None, ), - strokeDash=alt.StrokeDash( - "Probability_Type:N", - scale=alt.Scale( - domain=["target", "actual"], range=[[1, 0], [5, 5]] - ), - legend=None, - ), tooltip=[ alt.Tooltip("Time_min:Q", title="Time (min)", format=".2f"), alt.Tooltip( From cd8b2fad384f1728b08d65c164096d8a3b0c853a Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Tue, 8 Jul 2025 03:20:27 +0800 Subject: [PATCH 43/71] Add cell_mass analysis used for multivariant simulation --- ecoli/analysis/multivariant/cell_mass.py | 160 +++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 ecoli/analysis/multivariant/cell_mass.py diff --git a/ecoli/analysis/multivariant/cell_mass.py b/ecoli/analysis/multivariant/cell_mass.py 
new file mode 100644 index 000000000..96bee9d3e --- /dev/null +++ b/ecoli/analysis/multivariant/cell_mass.py @@ -0,0 +1,160 @@ +""" +Plot absolue / normalized cell mass over time for multivariant simulation in vEcoli, and: +1. each variant has its own plot; +2. at each subplot, time is divided by generation id; + +It can also be used at multigeneration analysis. +""" + +import os +from typing import Any +import altair as alt +import polars as pl +import pandas as pd +from duckdb import DuckDBPyConnection + + +def plot( + params: dict[str, Any], + conn: DuckDBPyConnection, + history_sql: str, + config_sql: str, + success_sql: str, + sim_data_dict: dict[str, dict[int, str]], + validation_data_paths: list[str], + outdir: str, + variant_metadata: dict[str, dict[int, Any]], + variant_names: dict[str, str], +): + # Load data with required columns + required_columns = [ + "time", + "variant", + "generation", + "agent_id", + "listeners__mass__dry_mass", + ] + + sql = f""" + SELECT {", ".join(required_columns)} + FROM ({history_sql}) + WHERE agent_id = 0 + ORDER BY variant, generation, time + """ + + df = conn.sql(sql).pl() + + # Process time and mass data + df = df.with_columns( + [ + (pl.col("time") / 60).alias("time_min"), + ] + ) + + # Calculate initial mass for each generation for normalization + generation_stats = df.group_by(["variant", "generation"]).agg( + [pl.col("listeners__mass__dry_mass").min().alias("initial_mass")] + ) + + # Join back to main dataframe and calculate normalized mass + df = df.join(generation_stats, on=["variant", "generation"], how="left") + + # Calculate normalized mass + df = df.with_columns( + [ + (pl.col("listeners__mass__dry_mass") / pl.col("initial_mass")).alias( + "dry_mass_normalized" + ) + ] + ) + + # Get variants and create plots + variants = df.select("variant").unique().to_series().to_list() + + # ----------------------------------------# + plots = [] + + for variant in variants: + variant_df = df.filter(pl.col("variant") == 
variant).to_pandas() + variant_name = variant_names.get(variant, f"Variant {variant}") + + # Create base chart with line plots only + base = alt.Chart(variant_df).add_selection( + alt.selection_interval(bind="scales") + ) + + # Base encoding + base_encode = { + "x": alt.X("time_min:Q", title="Time (min)", scale=alt.Scale(nice=False)), + "color": alt.Color( + "generation:N", + legend=alt.Legend(title="Generation"), + scale=alt.Scale(scheme="category10"), + ), + "tooltip": ["time_min:Q", "generation:N"], + } + + # Absolute dry mass plot + mass_plot = ( + base.mark_line(strokeWidth=2.5) + .encode( + y=alt.Y( + "listeners__mass__dry_mass:Q", + title="Dry Mass (fg)", + scale=alt.Scale(nice=False), + ), + tooltip=base_encode["tooltip"] + ["listeners__mass__dry_mass:Q"], + **{k: v for k, v in base_encode.items() if k != "tooltip"}, + ) + .properties( + width=400, height=200, title=f"{variant_name} - Absolute Dry Mass" + ) + ) + + # Normalized dry mass plot + norm_mass_plot = ( + base.mark_line(strokeWidth=2.5) + .encode( + y=alt.Y( + "dry_mass_normalized:Q", + title="Normalized Dry Mass", + scale=alt.Scale(nice=False), + ), + tooltip=base_encode["tooltip"] + ["dry_mass_normalized:Q"], + **{k: v for k, v in base_encode.items() if k != "tooltip"}, + ) + .properties( + width=400, height=200, title=f"{variant_name} - Normalized Dry Mass" + ) + ) + + # Add reference line at y=2 (doubling mass) + reference_line = ( + alt.Chart(pd.DataFrame({"y": [2]})) + .mark_rule(color="red", strokeDash=[5, 5], strokeWidth=1) + .encode(y="y:Q") + ) + + norm_mass_plot = norm_mass_plot + reference_line + + # Combine plots for this variant + variant_combined = ( + alt.hconcat(mass_plot, norm_mass_plot) + .resolve_scale(x="shared") + .properties(title=f"{variant_name} Cell Mass Analysis") + ) + + plots.append(variant_combined) + + # Create combined plot + final_plot = plots[0] if len(plots) == 1 else alt.vconcat(*plots) + final_plot = final_plot.resolve_scale(x="independent", 
y="independent").properties( + title="Multi-Variant Cell Mass Analysis" + ) + + # Save plot + out_path = os.path.join(outdir, "multivariant_cell_mass_report.html") + final_plot.save(out_path) + print(f"Saved multi-variant cell mass visualization to: {out_path}") + + return final_plot From 13cad89a299f319501821c998039b9b63b85b3aa Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Tue, 8 Jul 2025 03:34:25 +0800 Subject: [PATCH 44/71] Modification --- ecoli/analysis/multivariant/cell_mass.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ecoli/analysis/multivariant/cell_mass.py b/ecoli/analysis/multivariant/cell_mass.py index 96bee9d3e..5b0b030eb 100644 --- a/ecoli/analysis/multivariant/cell_mass.py +++ b/ecoli/analysis/multivariant/cell_mass.py @@ -7,7 +7,7 @@ """ import os -from typing import Any +from typing import Any, Dict, List import altair as alt import polars as pl import pandas as pd @@ -84,7 +84,7 @@ def plot( ) # Base encoding - base_encode = { + base_encode: Dict[str, List[str]] = { "x": alt.X("time_min:Q", title="Time (min)", scale=alt.Scale(nice=False)), "color": alt.Color( "generation:N", From 2cd3804aa06a19f78ea3ac6e5dd5d8eba37d1a1d Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Tue, 8 Jul 2025 03:50:01 +0800 Subject: [PATCH 45/71] Modification --- ecoli/analysis/multivariant/cell_mass.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ecoli/analysis/multivariant/cell_mass.py b/ecoli/analysis/multivariant/cell_mass.py index 5b0b030eb..0132d7d1f 100644 --- a/ecoli/analysis/multivariant/cell_mass.py +++ b/ecoli/analysis/multivariant/cell_mass.py @@ -7,7 +7,7 @@ """ import os -from typing import Any, Dict, List +from typing import Any import altair as alt import polars as pl import pandas as pd @@ -84,14 +84,15 @@ def plot( ) # Base encoding - base_encode: Dict[str, List[str]] = { + tooltip_fields: list[str] = ["time_min:Q", "generation:N"] + 
base_encode = { "x": alt.X("time_min:Q", title="Time (min)", scale=alt.Scale(nice=False)), "color": alt.Color( "generation:N", legend=alt.Legend(title="Generation"), scale=alt.Scale(scheme="category10"), ), - "tooltip": ["time_min:Q", "generation:N"], + "tooltip": tooltip_fields, } # Absolute dry mass plot From 46ce7cc25a4437960dde6bfcea2069a16e1ee586 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Tue, 8 Jul 2025 04:00:02 +0800 Subject: [PATCH 46/71] Modification --- ecoli/analysis/multivariant/cell_mass.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ecoli/analysis/multivariant/cell_mass.py b/ecoli/analysis/multivariant/cell_mass.py index 0132d7d1f..9321ea721 100644 --- a/ecoli/analysis/multivariant/cell_mass.py +++ b/ecoli/analysis/multivariant/cell_mass.py @@ -92,20 +92,20 @@ def plot( legend=alt.Legend(title="Generation"), scale=alt.Scale(scheme="category10"), ), - "tooltip": tooltip_fields, } # Absolute dry mass plot mass_plot = ( base.mark_line(strokeWidth=2.5) .encode( + x=base_encode["x"], + color=base_encode["color"], + tooltip=tooltip_fields + ["listeners__mass__dry_mass:Q"], y=alt.Y( "listeners__mass__dry_mass:Q", title="Dry Mass (fg)", scale=alt.Scale(nice=False), ), - tooltip=base_encode["tooltip"] + ["listeners__mass__dry_mass:Q"], - **{k: v for k, v in base_encode.items() if k != "tooltip"}, ) .properties( width=400, height=200, title=f"{variant_name} - Absolute Dry Mass" @@ -116,13 +116,14 @@ def plot( norm_mass_plot = ( base.mark_line(strokeWidth=2.5) .encode( + x=base_encode["x"], + color=base_encode["color"], + tooltip=tooltip_fields + ["listeners__mass__dry_mass:Q"], y=alt.Y( "dry_mass_normalized:Q", title="Normalized Dry Mass", scale=alt.Scale(nice=False), ), - tooltip=base_encode["tooltip"] + ["dry_mass_normalized:Q"], - **{k: v for k, v in base_encode.items() if k != "tooltip"}, ) .properties( width=400, height=200, title=f"{variant_name} - Normalized Dry Mass" From 
0c790c04195b1aa6d79f0bf377c49d124d823082 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Tue, 8 Jul 2025 12:03:40 -0700 Subject: [PATCH 47/71] Add vl-convert dependency Required to save Altair charts as PNG, SVG, offline HTML, PDF --- pyproject.toml | 2 ++ uv.lock | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 4e094894b..21e4b2ecc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,8 @@ dependencies = [ "stochastic-arrow", "autograd", "xmltodict", + # Required to save Altair charts as PNG + "vl-convert-python", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index 18abae244..92254fbb4 100644 --- a/uv.lock +++ b/uv.lock @@ -1979,6 +1979,8 @@ sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/d0d6dea55cd152ce3 wheels = [ { url = "https://files.pythonhosted.org/packages/40/fe/1bc9b3ee13f68487a99ac9529968035cca2f0a51ec36892060edcc51d06a/pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4", size = 5278800, upload-time = "2025-07-01T09:14:17.648Z" }, { url = "https://files.pythonhosted.org/packages/2c/32/7e2ac19b5713657384cec55f89065fb306b06af008cfd87e572035b27119/pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69", size = 4686296, upload-time = "2025-07-01T09:14:19.828Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1e/b9e12bbe6e4c2220effebc09ea0923a07a6da1e1f1bfbc8d7d29a01ce32b/pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d", size = 5871726, upload-time = "2025-07-03T13:10:04.448Z" }, + { url = "https://files.pythonhosted.org/packages/8d/33/e9200d2bd7ba00dc3ddb78df1198a6e80d7669cce6c2bdbeb2530a74ec58/pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6", size = 7644652, upload-time = "2025-07-03T13:10:10.391Z" }, { url = "https://files.pythonhosted.org/packages/41/f1/6f2427a26fc683e00d985bc391bdd76d8dd4e92fac33d841127eb8fb2313/pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7", size = 5977787, upload-time = "2025-07-01T09:14:21.63Z" }, { url = "https://files.pythonhosted.org/packages/e4/c9/06dd4a38974e24f932ff5f98ea3c546ce3f8c995d3f0985f8e5ba48bba19/pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024", size = 6645236, upload-time = "2025-07-01T09:14:23.321Z" }, { url = "https://files.pythonhosted.org/packages/40/e7/848f69fb79843b3d91241bad658e9c14f39a32f71a301bcd1d139416d1be/pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809", size = 6086950, upload-time = "2025-07-01T09:14:25.237Z" }, @@ -3134,6 +3136,7 @@ dependencies = [ { name = "tqdm" }, { name = "unum" }, { name = "vivarium-core" }, + { name = "vl-convert-python" }, { name = "xmltodict" }, ] @@ -3199,6 +3202,7 @@ requires-dist = [ { name = "tqdm" }, { name = "unum" }, { name = "vivarium-core" }, + { name = "vl-convert-python" }, { name = "xmltodict" }, ] provides-extras = ["dev", "docs"] @@ -3234,6 +3238,19 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/3e/87/794e0b4c5dccbca3036152fe5df56860a57e70f3e68ac0198dbd7df60fcb/vivarium-core-1.6.5.tar.gz", hash = "sha256:1d83faa60005304b548f623447ab8675a06bb7ed8f6b7c0bd25b4aaa3381fccb", size = 136102, upload-time = "2024-12-03T21:49:29.797Z" } +[[package]] +name = "vl-convert-python" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/15/08/06945bff9655c5b0520a8d1b2550cd8007e106ebec45a33840035420e0d2/vl_convert_python-1.8.0.tar.gz", hash = "sha256:ceca613ca5551c55270a15ca48d0f3a7de1e949e0f127310e9b0f6570ea3fbbb", size = 4651586, upload-time = "2025-05-28T00:06:47.6Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/5a/9dca7d8ff56e82c298e9ef381cfc803e262b85b7c59f2515d0e9f81a75b6/vl_convert_python-1.8.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f663317fc280b07553534195c1e31c4ca882d9c8601430211b078196db5ed227", size = 29956698, upload-time = "2025-05-28T00:06:29.533Z" }, + { url = "https://files.pythonhosted.org/packages/42/e2/325e6b5895482b2534e7462c012f237c66ffb02fb3af45eec0accab2f8d4/vl_convert_python-1.8.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:81f6380019ceadf070a79f85aa624475a6568093f70de0e151a32e91ecbcaacf", size = 28831173, upload-time = "2025-05-28T00:06:32.925Z" }, + { url = "https://files.pythonhosted.org/packages/09/fa/1dd944c9e9898e59e31c385bdce215aca543acc555de20b8bf4dc60ddb89/vl_convert_python-1.8.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3388e3913287867b3553c10f81ca2d85268216a5a75e7c71b9c1b59887c1977e", size = 31668750, upload-time = "2025-05-28T00:06:36.158Z" }, + { url = "https://files.pythonhosted.org/packages/a7/6b/48f6d47a92eaf6f0dd235146307a7eb0d179b78d2faebc53aca3f1e49177/vl_convert_python-1.8.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b51264998e8fcc43dbce801484a950cfe6513cdc4c46b20604ef50989855a617", size = 32970141, upload-time = "2025-05-28T00:06:41.323Z" }, + { url = "https://files.pythonhosted.org/packages/f8/6f/29dce05f9167e3a01ab74d79eeadd531bc24cf59e3a7fc3736af476ca431/vl_convert_python-1.8.0-cp37-abi3-win_amd64.whl", hash = "sha256:9f1146b791ed27916f54c45e1d66af53a40eb26e5aaea1892f33eb9a935039ab", size = 31318167, upload-time = "2025-05-28T00:06:44.881Z" }, +] + [[package]] name = "wcwidth" version = "0.2.13" From 
5b3426c6c05f2e597a06e2c5276af8b237d1c1e4 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Wed, 9 Jul 2025 03:03:51 +0800 Subject: [PATCH 48/71] Corret the logic for SQL requirement and plot --- ecoli/analysis/multivariant/cell_mass.py | 33 ++++++++---------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/ecoli/analysis/multivariant/cell_mass.py b/ecoli/analysis/multivariant/cell_mass.py index 9321ea721..cda7ad18f 100644 --- a/ecoli/analysis/multivariant/cell_mass.py +++ b/ecoli/analysis/multivariant/cell_mass.py @@ -30,50 +30,35 @@ def plot( required_columns = [ "time", "variant", + "lineage_seed", "generation", "agent_id", "listeners__mass__dry_mass", + "listeners__mass__dry_mass_fold_change", ] sql = f""" SELECT {", ".join(required_columns)} FROM ({history_sql}) - WHERE agent_id = 0 - ORDER BY variant, generation, time + ORDER BY variant, lineage_seed, generation, time """ df = conn.sql(sql).pl() - # Process time and mass data + # Process time df = df.with_columns( [ (pl.col("time") / 60).alias("time_min"), ] ) - # Calculate initial mass for each generation for normalization - generation_stats = df.group_by(["variant", "generation"]).agg( - [pl.col("listeners__mass__dry_mass").min().alias("initial_mass")] - ) - - # Join back to main dataframe and calculate normalized mass - df = df.join(generation_stats, on=["variant", "generation"], how="left") - - # Calculate normalized mass - df = df.with_columns( - [ - (pl.col("listeners__mass__dry_mass") / pl.col("initial_mass")).alias( - "dry_mass_normalized" - ) - ] - ) - # Get variants and create plots variants = df.select("variant").unique().to_series().to_list() # ----------------------------------------# plots = [] + # Create subplot for each variant for variant in variants: variant_df = df.filter(pl.col("variant") == variant).to_pandas() variant_name = variant_names.get(variant, f"Variant {variant}") @@ -87,6 +72,8 @@ def plot( tooltip_fields: list[str] = 
["time_min:Q", "generation:N"] base_encode = { "x": alt.X("time_min:Q", title="Time (min)", scale=alt.Scale(nice=False)), + # Different generations with different colors + # Within same generation, color is the same "color": alt.Color( "generation:N", legend=alt.Legend(title="Generation"), @@ -101,6 +88,7 @@ def plot( x=base_encode["x"], color=base_encode["color"], tooltip=tooltip_fields + ["listeners__mass__dry_mass:Q"], + detail="lineage_seed:N", y=alt.Y( "listeners__mass__dry_mass:Q", title="Dry Mass (fg)", @@ -118,9 +106,10 @@ def plot( .encode( x=base_encode["x"], color=base_encode["color"], - tooltip=tooltip_fields + ["listeners__mass__dry_mass:Q"], + tooltip=tooltip_fields + ["listeners__mass__dry_mass_fold_change:Q"], + detail="lineage_seed:N", y=alt.Y( - "dry_mass_normalized:Q", + "listeners__mass__dry_mass_fold_change:Q", title="Normalized Dry Mass", scale=alt.Scale(nice=False), ), From 84fc31807962ce8cb151f20b65a3d66014fc84ae Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Wed, 9 Jul 2025 05:01:44 +0800 Subject: [PATCH 49/71] Correct the legend in ribosome_usage plot --- ecoli/analysis/multigeneration/ribosome_usage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecoli/analysis/multigeneration/ribosome_usage.py b/ecoli/analysis/multigeneration/ribosome_usage.py index b49ae5249..04bcc8a15 100644 --- a/ecoli/analysis/multigeneration/ribosome_usage.py +++ b/ecoli/analysis/multigeneration/ribosome_usage.py @@ -253,7 +253,7 @@ def create_line_chart(y_field, title, y_title, skip_first_point=False): .encode( x=alt.X("time_min:Q", title="Time (min)"), y=alt.Y(f"{y_field}:Q", title=y_title), - color=alt.Color("generation:N", legend=alt.Legend(title="Variant")), + color=alt.Color("generation:N", legend=alt.Legend(title="Generation")), ) .properties(title=title, width=600, height=120) ) From b3ef046580ff33d9434119ef9e77bb8837c8137a Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Tue, 8 Jul 2025 17:06:57 -0700 
Subject: [PATCH 50/71] Safer method of cleaning exponent and base Previously, strip would incorrectly remove - signs and non-decimal trailing zeros --- ecoli/library/parameters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ecoli/library/parameters.py b/ecoli/library/parameters.py index f13629abe..c3121eaf3 100644 --- a/ecoli/library/parameters.py +++ b/ecoli/library/parameters.py @@ -685,8 +685,9 @@ def main(): value_str = "{:.2e}".format(row.param.value.to(row.units).magnitude) if "e" in value_str: base, exponent = value_str.split("e") - exponent = exponent.strip("+-0") - base = base.strip("0") + exponent = int(exponent) + if "." in base: + base = base.rstrip("0").rstrip(".") if exponent: value_str = "%s \\times 10^{%s}" % (base, exponent) else: From 0d5c79a643023cf6b1d196e60da33a590f7011c6 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Tue, 8 Jul 2025 23:48:40 -0700 Subject: [PATCH 51/71] Prevent unintentional overwriting of analysis metadata --- runscripts/analysis.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/runscripts/analysis.py b/runscripts/analysis.py index 6445bba9e..ae03b123b 100644 --- a/runscripts/analysis.py +++ b/runscripts/analysis.py @@ -292,6 +292,17 @@ def main(): } variant_names = {config["experiment_id"][0]: variant_name} + # Save copy of config JSON with parameters for plots + metadata_path = os.path.join(os.path.abspath(config["outdir"]), "metadata.json") + if os.path.exists(metadata_path): + raise FileExistsError( + f"{metadata_path} already exists, indicating an analysis has " + f"been run with output directory {config['outdir']}. Please " + "delete/move it or specify a different output directory." 
+ ) + with open(metadata_path, "w") as f: + json.dump(config, f) + # Establish DuckDB connection conn = create_duckdb_conn(out_uri, gcs_bucket, config.get("cpus")) history_sql, config_sql, success_sql = dataset_sql(out_uri, config["experiment_id"]) @@ -371,12 +382,6 @@ def main(): variant_names, ) - # Save copy of config JSON with parameters for plots - with open( - os.path.join(os.path.abspath(config["outdir"]), "metadata.json"), "w" - ) as f: - json.dump(config, f) - if __name__ == "__main__": main() From 469f9607695b95dd7b4e7189d7f70d52193ccb24 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Wed, 9 Jul 2025 14:45:17 -0700 Subject: [PATCH 52/71] Reminder to set shared binary permissions after updating --- doc/hpc.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/hpc.rst b/doc/hpc.rst index e4cc1dcf1..dcbcd9f88 100644 --- a/doc/hpc.rst +++ b/doc/hpc.rst @@ -73,6 +73,8 @@ To run scripts on Sherlock through a SLURM batch script, see :ref:`sherlock-noni 1. Nextflow: ``NXF_EDGE=1 nextflow self-update`` 2. HyperQueue: See :ref:`hq-info`. + Then, reset the permissions of the updated binaries with ``chmod 777 *``. + .. 
_sherlock-config: Configuration From a78640b22af3758622ce69c04deb0fee976eafbe Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Wed, 9 Jul 2025 19:49:13 -0700 Subject: [PATCH 53/71] Add cell mass analysis to Jenkins test --- runscripts/jenkins/configs/ecoli-glucose-minimal.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runscripts/jenkins/configs/ecoli-glucose-minimal.json b/runscripts/jenkins/configs/ecoli-glucose-minimal.json index a00e86ffc..747022a3a 100644 --- a/runscripts/jenkins/configs/ecoli-glucose-minimal.json +++ b/runscripts/jenkins/configs/ecoli-glucose-minimal.json @@ -12,7 +12,7 @@ "analysis_options": { "single": {"mass_fraction_summary": {}}, "multiseed": {"protein_counts_validation": {}}, - "multivariant": {"doubling_time_hist": {"skip_n_gens": 0}, "doubling_time_line": {}}, + "multivariant": {"doubling_time_hist": {"skip_n_gens": 0}, "doubling_time_line": {}, "cell_mass": {}}, "multigeneration": {"replication": {}, "ribosome_usage": {}} }, "sherlock": { From ebfd336c8c9fd128c8dea847b65f84b39bdcc951 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 22 Jul 2025 00:30:23 +0000 Subject: [PATCH 54/71] fix(security): update package versions --- uv.lock | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/uv.lock b/uv.lock index 92254fbb4..86433ea97 100644 --- a/uv.lock +++ b/uv.lock @@ -26,7 +26,7 @@ wheels = [ [[package]] name = "aiohttp" -version = "3.12.13" +version = "3.12.14" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohappyeyeballs" }, @@ -37,37 +37,38 @@ dependencies = [ { name = "propcache" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/42/6e/ab88e7cb2a4058bed2f7870276454f85a7c56cd6da79349eb314fc7bbcaa/aiohttp-3.12.13.tar.gz", hash = "sha256:47e2da578528264a12e4e3dd8dd72a7289e5f812758fe086473fab037a10fcce", size = 
7819160, upload-time = "2025-06-14T15:15:41.354Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e6/0b/e39ad954107ebf213a2325038a3e7a506be3d98e1435e1f82086eec4cde2/aiohttp-3.12.14.tar.gz", hash = "sha256:6e06e120e34d93100de448fd941522e11dafa78ef1a893c179901b7d66aa29f2", size = 7822921, upload-time = "2025-07-10T13:05:33.968Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b4/6a/ce40e329788013cd190b1d62bbabb2b6a9673ecb6d836298635b939562ef/aiohttp-3.12.13-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0aa580cf80558557285b49452151b9c69f2fa3ad94c5c9e76e684719a8791b73", size = 700491, upload-time = "2025-06-14T15:14:00.048Z" }, - { url = "https://files.pythonhosted.org/packages/28/d9/7150d5cf9163e05081f1c5c64a0cdf3c32d2f56e2ac95db2a28fe90eca69/aiohttp-3.12.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b103a7e414b57e6939cc4dece8e282cfb22043efd0c7298044f6594cf83ab347", size = 475104, upload-time = "2025-06-14T15:14:01.691Z" }, - { url = "https://files.pythonhosted.org/packages/f8/91/d42ba4aed039ce6e449b3e2db694328756c152a79804e64e3da5bc19dffc/aiohttp-3.12.13-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:78f64e748e9e741d2eccff9597d09fb3cd962210e5b5716047cbb646dc8fe06f", size = 467948, upload-time = "2025-06-14T15:14:03.561Z" }, - { url = "https://files.pythonhosted.org/packages/99/3b/06f0a632775946981d7c4e5a865cddb6e8dfdbaed2f56f9ade7bb4a1039b/aiohttp-3.12.13-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29c955989bf4c696d2ededc6b0ccb85a73623ae6e112439398935362bacfaaf6", size = 1714742, upload-time = "2025-06-14T15:14:05.558Z" }, - { url = "https://files.pythonhosted.org/packages/92/a6/2552eebad9ec5e3581a89256276009e6a974dc0793632796af144df8b740/aiohttp-3.12.13-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d640191016763fab76072c87d8854a19e8e65d7a6fcfcbf017926bdbbb30a7e5", size = 1697393, upload-time = "2025-06-14T15:14:07.194Z" }, - { 
url = "https://files.pythonhosted.org/packages/d8/9f/bd08fdde114b3fec7a021381b537b21920cdd2aa29ad48c5dffd8ee314f1/aiohttp-3.12.13-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4dc507481266b410dede95dd9f26c8d6f5a14315372cc48a6e43eac652237d9b", size = 1752486, upload-time = "2025-06-14T15:14:08.808Z" }, - { url = "https://files.pythonhosted.org/packages/f7/e1/affdea8723aec5bd0959171b5490dccd9a91fcc505c8c26c9f1dca73474d/aiohttp-3.12.13-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8a94daa873465d518db073bd95d75f14302e0208a08e8c942b2f3f1c07288a75", size = 1798643, upload-time = "2025-06-14T15:14:10.767Z" }, - { url = "https://files.pythonhosted.org/packages/f3/9d/666d856cc3af3a62ae86393baa3074cc1d591a47d89dc3bf16f6eb2c8d32/aiohttp-3.12.13-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:177f52420cde4ce0bb9425a375d95577fe082cb5721ecb61da3049b55189e4e6", size = 1718082, upload-time = "2025-06-14T15:14:12.38Z" }, - { url = "https://files.pythonhosted.org/packages/f3/ce/3c185293843d17be063dada45efd2712bb6bf6370b37104b4eda908ffdbd/aiohttp-3.12.13-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f7df1f620ec40f1a7fbcb99ea17d7326ea6996715e78f71a1c9a021e31b96b8", size = 1633884, upload-time = "2025-06-14T15:14:14.415Z" }, - { url = "https://files.pythonhosted.org/packages/3a/5b/f3413f4b238113be35dfd6794e65029250d4b93caa0974ca572217745bdb/aiohttp-3.12.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3062d4ad53b36e17796dce1c0d6da0ad27a015c321e663657ba1cc7659cfc710", size = 1694943, upload-time = "2025-06-14T15:14:16.48Z" }, - { url = "https://files.pythonhosted.org/packages/82/c8/0e56e8bf12081faca85d14a6929ad5c1263c146149cd66caa7bc12255b6d/aiohttp-3.12.13-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:8605e22d2a86b8e51ffb5253d9045ea73683d92d47c0b1438e11a359bdb94462", size = 1716398, upload-time = 
"2025-06-14T15:14:18.589Z" }, - { url = "https://files.pythonhosted.org/packages/ea/f3/33192b4761f7f9b2f7f4281365d925d663629cfaea093a64b658b94fc8e1/aiohttp-3.12.13-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:54fbbe6beafc2820de71ece2198458a711e224e116efefa01b7969f3e2b3ddae", size = 1657051, upload-time = "2025-06-14T15:14:20.223Z" }, - { url = "https://files.pythonhosted.org/packages/5e/0b/26ddd91ca8f84c48452431cb4c5dd9523b13bc0c9766bda468e072ac9e29/aiohttp-3.12.13-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:050bd277dfc3768b606fd4eae79dd58ceda67d8b0b3c565656a89ae34525d15e", size = 1736611, upload-time = "2025-06-14T15:14:21.988Z" }, - { url = "https://files.pythonhosted.org/packages/c3/8d/e04569aae853302648e2c138a680a6a2f02e374c5b6711732b29f1e129cc/aiohttp-3.12.13-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2637a60910b58f50f22379b6797466c3aa6ae28a6ab6404e09175ce4955b4e6a", size = 1764586, upload-time = "2025-06-14T15:14:23.979Z" }, - { url = "https://files.pythonhosted.org/packages/ac/98/c193c1d1198571d988454e4ed75adc21c55af247a9fda08236602921c8c8/aiohttp-3.12.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e986067357550d1aaa21cfe9897fa19e680110551518a5a7cf44e6c5638cb8b5", size = 1724197, upload-time = "2025-06-14T15:14:25.692Z" }, - { url = "https://files.pythonhosted.org/packages/e7/9e/07bb8aa11eec762c6b1ff61575eeeb2657df11ab3d3abfa528d95f3e9337/aiohttp-3.12.13-cp312-cp312-win32.whl", hash = "sha256:ac941a80aeea2aaae2875c9500861a3ba356f9ff17b9cb2dbfb5cbf91baaf5bf", size = 421771, upload-time = "2025-06-14T15:14:27.364Z" }, - { url = "https://files.pythonhosted.org/packages/52/66/3ce877e56ec0813069cdc9607cd979575859c597b6fb9b4182c6d5f31886/aiohttp-3.12.13-cp312-cp312-win_amd64.whl", hash = "sha256:671f41e6146a749b6c81cb7fd07f5a8356d46febdaaaf07b0e774ff04830461e", size = 447869, upload-time = "2025-06-14T15:14:29.05Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/0d/29026524e9336e33d9767a1e593ae2b24c2b8b09af7c2bd8193762f76b3e/aiohttp-3.12.14-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a0ecbb32fc3e69bc25efcda7d28d38e987d007096cbbeed04f14a6662d0eee22", size = 701055, upload-time = "2025-07-10T13:03:45.59Z" }, + { url = "https://files.pythonhosted.org/packages/0a/b8/a5e8e583e6c8c1056f4b012b50a03c77a669c2e9bf012b7cf33d6bc4b141/aiohttp-3.12.14-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0400f0ca9bb3e0b02f6466421f253797f6384e9845820c8b05e976398ac1d81a", size = 475670, upload-time = "2025-07-10T13:03:47.249Z" }, + { url = "https://files.pythonhosted.org/packages/29/e8/5202890c9e81a4ec2c2808dd90ffe024952e72c061729e1d49917677952f/aiohttp-3.12.14-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a56809fed4c8a830b5cae18454b7464e1529dbf66f71c4772e3cfa9cbec0a1ff", size = 468513, upload-time = "2025-07-10T13:03:49.377Z" }, + { url = "https://files.pythonhosted.org/packages/23/e5/d11db8c23d8923d3484a27468a40737d50f05b05eebbb6288bafcb467356/aiohttp-3.12.14-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f2e373276e4755691a963e5d11756d093e346119f0627c2d6518208483fb6d", size = 1715309, upload-time = "2025-07-10T13:03:51.556Z" }, + { url = "https://files.pythonhosted.org/packages/53/44/af6879ca0eff7a16b1b650b7ea4a827301737a350a464239e58aa7c387ef/aiohttp-3.12.14-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:ca39e433630e9a16281125ef57ece6817afd1d54c9f1bf32e901f38f16035869", size = 1697961, upload-time = "2025-07-10T13:03:53.511Z" }, + { url = "https://files.pythonhosted.org/packages/bb/94/18457f043399e1ec0e59ad8674c0372f925363059c276a45a1459e17f423/aiohttp-3.12.14-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c748b3f8b14c77720132b2510a7d9907a03c20ba80f469e58d5dfd90c079a1c", size = 1753055, upload-time = "2025-07-10T13:03:55.368Z" }, + { url = 
"https://files.pythonhosted.org/packages/26/d9/1d3744dc588fafb50ff8a6226d58f484a2242b5dd93d8038882f55474d41/aiohttp-3.12.14-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0a568abe1b15ce69d4cc37e23020720423f0728e3cb1f9bcd3f53420ec3bfe7", size = 1799211, upload-time = "2025-07-10T13:03:57.216Z" }, + { url = "https://files.pythonhosted.org/packages/73/12/2530fb2b08773f717ab2d249ca7a982ac66e32187c62d49e2c86c9bba9b4/aiohttp-3.12.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9888e60c2c54eaf56704b17feb558c7ed6b7439bca1e07d4818ab878f2083660", size = 1718649, upload-time = "2025-07-10T13:03:59.469Z" }, + { url = "https://files.pythonhosted.org/packages/b9/34/8d6015a729f6571341a311061b578e8b8072ea3656b3d72329fa0faa2c7c/aiohttp-3.12.14-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3006a1dc579b9156de01e7916d38c63dc1ea0679b14627a37edf6151bc530088", size = 1634452, upload-time = "2025-07-10T13:04:01.698Z" }, + { url = "https://files.pythonhosted.org/packages/ff/4b/08b83ea02595a582447aeb0c1986792d0de35fe7a22fb2125d65091cbaf3/aiohttp-3.12.14-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:aa8ec5c15ab80e5501a26719eb48a55f3c567da45c6ea5bb78c52c036b2655c7", size = 1695511, upload-time = "2025-07-10T13:04:04.165Z" }, + { url = "https://files.pythonhosted.org/packages/b5/66/9c7c31037a063eec13ecf1976185c65d1394ded4a5120dd5965e3473cb21/aiohttp-3.12.14-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:39b94e50959aa07844c7fe2206b9f75d63cc3ad1c648aaa755aa257f6f2498a9", size = 1716967, upload-time = "2025-07-10T13:04:06.132Z" }, + { url = "https://files.pythonhosted.org/packages/ba/02/84406e0ad1acb0fb61fd617651ab6de760b2d6a31700904bc0b33bd0894d/aiohttp-3.12.14-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:04c11907492f416dad9885d503fbfc5dcb6768d90cad8639a771922d584609d3", size = 1657620, upload-time = "2025-07-10T13:04:07.944Z" }, + { url = 
"https://files.pythonhosted.org/packages/07/53/da018f4013a7a179017b9a274b46b9a12cbeb387570f116964f498a6f211/aiohttp-3.12.14-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:88167bd9ab69bb46cee91bd9761db6dfd45b6e76a0438c7e884c3f8160ff21eb", size = 1737179, upload-time = "2025-07-10T13:04:10.182Z" }, + { url = "https://files.pythonhosted.org/packages/49/e8/ca01c5ccfeaafb026d85fa4f43ceb23eb80ea9c1385688db0ef322c751e9/aiohttp-3.12.14-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:791504763f25e8f9f251e4688195e8b455f8820274320204f7eafc467e609425", size = 1765156, upload-time = "2025-07-10T13:04:12.029Z" }, + { url = "https://files.pythonhosted.org/packages/22/32/5501ab525a47ba23c20613e568174d6c63aa09e2caa22cded5c6ea8e3ada/aiohttp-3.12.14-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2785b112346e435dd3a1a67f67713a3fe692d288542f1347ad255683f066d8e0", size = 1724766, upload-time = "2025-07-10T13:04:13.961Z" }, + { url = "https://files.pythonhosted.org/packages/06/af/28e24574801fcf1657945347ee10df3892311c2829b41232be6089e461e7/aiohttp-3.12.14-cp312-cp312-win32.whl", hash = "sha256:15f5f4792c9c999a31d8decf444e79fcfd98497bf98e94284bf390a7bb8c1729", size = 422641, upload-time = "2025-07-10T13:04:16.018Z" }, + { url = "https://files.pythonhosted.org/packages/98/d5/7ac2464aebd2eecac38dbe96148c9eb487679c512449ba5215d233755582/aiohttp-3.12.14-cp312-cp312-win_amd64.whl", hash = "sha256:3b66e1a182879f579b105a80d5c4bd448b91a57e8933564bf41665064796a338", size = 449316, upload-time = "2025-07-10T13:04:18.289Z" }, ] [[package]] name = "aiosignal" -version = "1.3.2" +version = "1.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "frozenlist" }, + { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ba/b5/6d55e80f6d8a08ce22b982eafa278d823b541c925f11ee774b0b9c43473d/aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54", size = 19424, upload-time = 
"2024-12-13T17:10:40.86Z" } +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597, upload-time = "2024-12-13T17:10:38.469Z" }, + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, ] [[package]] @@ -2844,15 +2845,15 @@ wheels = [ [[package]] name = "starlette" -version = "0.47.1" +version = "0.47.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0a/69/662169fdb92fb96ec3eaee218cf540a629d629c86d7993d9651226a6789b/starlette-0.47.1.tar.gz", hash = "sha256:aef012dd2b6be325ffa16698f9dc533614fb1cebd593a906b90dc1025529a79b", size = 2583072, upload-time = "2025-06-21T04:03:17.337Z" } +sdist = { url = "https://files.pythonhosted.org/packages/04/57/d062573f391d062710d4088fa1369428c38d51460ab6fedff920efef932e/starlette-0.47.2.tar.gz", hash = "sha256:6ae9aa5db235e4846decc1e7b79c4f346adf41e9777aebeb49dfd09bbd7023d8", size = 2583948, upload-time = "2025-07-20T17:31:58.522Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/82/95/38ef0cd7fa11eaba6a99b3c4f5ac948d8bc6ff199aabd327a29cc000840c/starlette-0.47.1-py3-none-any.whl", hash = "sha256:5e11c9f5c7c3f24959edbf2dffdc01bba860228acf657129467d8a7468591527", size = 
72747, upload-time = "2025-06-21T04:03:15.705Z" }, + { url = "https://files.pythonhosted.org/packages/f7/1f/b876b1f83aef204198a42dc101613fefccb32258e5428b5f9259677864b4/starlette-0.47.2-py3-none-any.whl", hash = "sha256:c5847e96134e5c5371ee9fac6fdf1a67336d5815e09eb2a01fdb57a351ef915b", size = 72984, upload-time = "2025-07-20T17:31:56.738Z" }, ] [[package]] From ab5f4fb670eb79ffac8308be0dcabda4f713e482 Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Sat, 2 Aug 2025 15:13:17 +0800 Subject: [PATCH 55/71] Doc. update on experiments and hpc part --- doc/experiments.rst | 6 + doc/hpc.rst | 293 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 299 insertions(+) diff --git a/doc/experiments.rst b/doc/experiments.rst index cdc1c7efb..69f7ae28f 100644 --- a/doc/experiments.rst +++ b/doc/experiments.rst @@ -134,7 +134,13 @@ documented in :ref:`/workflows.rst`. # simulations run using ecoli/experiments/ecoli_master_sim.py. Workflows # run with runscripts/workflow.py generate initial seeds using the value # of a different configuration option named "lineage_seed". + # Both seed and lineage_seed are supposed to be integers. "seed": 0, + # Number of initial simulations for each specific seed. + # Specifically, For example, if you pass in seed = 100 and n_init_sims = 5, + # then Nextflow channel will receive [100, 101, 102, 103, 104], + # and you can view each channel as a separate lineage. + "n_init_sims": 1, # Special flags to enable mechanisms related to antibiotic resistance. # See API documentation for ecoli.library.sim_data.LoadSimData for more # information. diff --git a/doc/hpc.rst b/doc/hpc.rst index dcbcd9f88..93d44831e 100644 --- a/doc/hpc.rst +++ b/doc/hpc.rst @@ -293,6 +293,299 @@ cloned repository and not the snapshot, add the ``-d`` flag and drop the ``/vEcoli/`` prefix from script names. Note that changing files in your cloned repository may affect SLURM batch jobs submitted with this flag. +.. 
_sherlock-Quick-Start: + +Quick Start +=========== + +Initial Setup +^^^^^^^^^^^^^ + +Request a Sherlock Account +-------------------------- + +If you've never had a Sherlock account: Go to https://www.sherlock.stanford.edu/ and click on ``Request an Account`` + +.. note:: + Markus will have to approve this. + +If you've had a Sherlock account for a previous group: Email srcc-support@stanford.edu asking them to move your account to mcovert. CC Markus on the email and, in the email body, ask him to give his approval. + +Additional Resources: Sherlock Documentation from Stanford +---------------------------------------------------------- + +* https://srcc.stanford.edu/workshops/sherlock-boarding-session +* https://www.sherlock.stanford.edu/docs/ + +Login to Sherlock +^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + ssh YOUR_SUNET_ID@login.sherlock.stanford.edu + # Type in your Stanford password + # Complete the Duo authentication + # The following setup steps should be done using the Sherlock terminal + # NOTE that this is a LOGIN node, so no major computing should be done here + + # It is best to use a compute node for things like cloning the repo, running code, resetting lpad, etc + + srun -p mcovert --time=4:00:00 --cpus-per-task=2 --pty bash + + # srun is the command for launching a job step under Slurm + # -p or --partition specifies which partition (queue) to use (here, mcovert) + # --time: sets the job's wall-clock time limit + # --cpus-per-task specifies the number of CPU cores for each task in this job step + # --pty: allocates a pseudo-terminal (TTY) to run an interactive session + # bash: launches a Bash shell + # Once the allocation is granted, you can usually see your job ID in your shell + + # You can use scancel to abort your job step + scancel JOB_ID + +You can also refer to the Sherlock Documentation: https://www.sherlock.stanford.edu/docs/getting-started/connecting/ + +Clone the vEcoli Repository +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Git clone the vEcoli repo to your Sherlock account: + +.. 
code-block:: bash + + git clone https://github.com/CovertLab/vEcoli.git + +If you have already created your branch, you can use: + +.. code-block:: bash + + # View all the branches (including remote branch) + git branch -a + + # Checkout to your own branch + git checkout + + # Validate your current branch + git branch + +2. Set up your ``vEcoli`` based on the detailed tutorial at ``Set Up`` chapter of Sherlock. + +.. tip:: + * You can use ``nano`` as text editor: + + .. code-block:: bash + + nano ~/.bash_profile + # After writing, you can use Ctrl+O to write out, Enter to confirm, and Ctrl+X to exit + + * If you choose to use ``vim``, press ``i`` for insert, and press ``Esc``, then type ``:wq`` and Enter for writing out + * Before running the ``python3`` to set up the env, ensure you are in the vEcoli repo + * It usually takes time to run first job + +Submit Your Job +^^^^^^^^^^^^^^^ + +Module Loading +-------------- + +For the time we login Sherlock, first we can use ``module load`` to load crucial tools for experiments: + +.. code-block:: bash + + # Load newer Git, Java (for nextflow), and Python + module load system git java/21.0.4 python/3.12.1 + # Include shared Nextflow and HyperQueue installations on PATH + export PATH=$PATH:$GROUP_HOME/vEcoli_env + +.. note:: + This only needs to be done once. + +Configuration Setup +------------------- + +1. Before running your job, you should refer to the tutorial to construct the config for Sherlock + +.. important:: + Since ``$HOME`` only has a pretty small storage limit (run ``sh_quota`` to view), it is **highly recommended** to use ``$SCRATCH`` as your ``emitter_arg`` instead (like: ``"out_dir": "/scratch/users//out"``). + +2. With configuration files, a workflow for vEcoli can be started with: + +.. code-block:: bash + + python3 runscripts/workflow.py --config + +If ``build_image`` is true in your config JSON, the terminal will report that a **SLURM job** was submitted to build the container image. 
+When the image build job starts, the terminal will report the build progress. + +.. note:: + * Remember to use ``python3`` instead of ``python`` + * This command is supposed to run on **login node**, which means there is no need to use ``srun`` to request a **compute node** + * If there is trouble with permission denied for nextflow (you can use ``nextflow -version`` to check out), you can try ``chmod a+rwx`` + +Job Monitoring +-------------- + +3. Once the build has finished, the terminal will report that a **SLURM job** was submitted for the Nextflow workflow orchestrator before exiting back to the shell. +At this point, you are free to close your connection, start additional workflows, etc. + +.. note:: + Unlike workflows run locally, Sherlock's containerized workflows mean any changes made to the repository after the container image has been built will not affect the running workflow. + +4. You can use ``squeue`` to view the status of your job: + +.. code-block:: bash + + # View by job + squeue -j + # View by user + squeue -u + +5. Again, you can start additional, concurrent workflows that each build a new image with different modifications to the cloned repo. +By setting ``build_image`` to ``false`` and ``container_image`` to the path of previously saved image, you can save time by reusing a previously built image. + +Debug or Perform Analysis +------------------------- + +It's recommended to use ``Interactive Container`` +(you can view more details in the tutorial at interactive-container). + +Before building up any containers, first you should have a basic ``Image`` for vEcoli. + +For example, we can use ``test_sherlock.json`` as config for both checking the project and building up a basic image: + +.. code-block:: bash + + # At head node: + python3 runscripts/workflow.py --config configs/test_sherlock.json + +By default, the image file is at ``vEcoli/test_sherlock/test_image``. + +Then, you can use: + +.. 
code-block:: bash + + runscripts/container/interactive.sh -i -a + +to build an interactive image. + +Inside the image, you can just use ``python`` commands rather than ``uv``. +For example, you can do further analysis for your simulation results: + +.. code-block:: bash + + python3 runscripts/analysis.py --config + +Moreover, if you want to exit the image, just use ``exit`` command. + +Interactive vs Non-Interactive Containers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Interactive Container +---------------------- + +1. Start an interactive container with your full image path: + +.. code-block:: bash + + runscripts/container/interactive.sh -i -a + +.. note:: + In this way any changes that you make to ``/vEcoli`` inside the container are **discarded** when the container terminates, + and ``~`` and any environment variables like ``$SCRATCH`` do not work inside the container. + +If you want to start an interactive container that **reflects the current state** of your cloned repository, +navigate to your cloned repository and run the above command with the ``-d`` flag to start a "development" container: + +.. code-block:: bash + + runscripts/container/interactive.sh -i -a -d + +In this mode, instead of editing source files in ``/vEcoli``, +you can directly edit the source files in your cloned repository and have those changes immediately reflected when running those scripts inside the container. + +.. note:: + Any changes you make will **persist** after the container terminates and can be tracked using Git version control. + +For more detailed information, please refer to the tutorial interactive-container + +Non-Interactive Container +-------------------------- + +To run any script inside a container with a non-interactive session, use the same command as **Interactive Container** but specify a command using the ``-c`` flag, for example: + +.. 
code-block:: bash + + runscripts/container/interactive.sh -i -c "python /vEcoli/runscripts/parca.py --config " + +Manual Script Execution with sbatch +------------------------------------ + +If you want to manually run scripts, you can use ``sbatch``. + +1. First, you should write your own Batch scripts: https://www.sherlock.stanford.edu/docs/getting-started/submitting/#batch-scripts + +Following is a sample for sbatch scripts: + +.. code-block:: bash + + #!/usr/bin/bash + #SBATCH --job-name=analysis_job + #SBATCH --output=analysis_job.%j.out + #SBATCH --error=analysis_job.%j.err + #SBATCH --time=20:00 + #SBATCH --ntasks=1 + #SBATCH --partition=owners,normal + #SBATCH --cpus-per-task 4 + #SBATCH --mem=8GB + + python3 runscripts/analysis.py --config + +2. Second, use ``sbatch`` to submit the job: + +.. code-block:: bash + + sbatch .sh + +You can use ``squeue`` to check your job status. Moreover, you can list the contents of the output file with the following commands: + +.. code-block:: bash + + cat slurm-.out + +Download Results to Local +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``SCP`` is convenient for downloading files from the cluster. You can simply execute the following on your **local terminal**: + +.. code-block:: bash + + # -r for recursively duplicate the whole repo: + scp -r @login.sherlock.stanford.edu:/path/to/remote/folder /path/to/local/destination + + # If you only want to download single file: + scp @login.sherlock.stanford.edu:/path/to/remote/file /path/to/local/destination/ + +This will require your password and Duo validation. + +In practice, usually we want to get the analytical results for our simulation. Due to the report files being HTML files typically, we can turn to shell wildcard and use ``rsync`` with ``include/exclude`` filters: + +.. 
code-block:: bash + + # Recursively downloads all .html files under the specific directory on Sherlock + # to your local machine while preserving the subdirectory structure: + + rsync -av --prune-empty-dirs \ + --include='*/' --include='*.html' --exclude='*' \ + @login.sherlock.stanford.edu:/path/to/remote/folder /path/to/local/destination + + # --include='*/': Keeps all directories, allowing rsync to traverse into subdirectories + # --include='*.html': Includes only .html files + # --exclude='*': Excludes everything else + # -a: Archive mode (preserves metadata) + # -v: Verbose output + # --prune-empty-dirs: Avoids creating empty directories on the local machine + +This will also require your password and Duo validation. + .. _other-cluster: -------------- From 4971f7d68f5f1faa0c6c98aa9d65ac5108d465ca Mon Sep 17 00:00:00 2001 From: Shu-Feather <2200017797@stu.pku.edu.cn> Date: Wed, 6 Aug 2025 06:11:54 +0800 Subject: [PATCH 56/71] Modification for hpc documentation --- doc/experiments.rst | 5 - doc/hpc.rst | 385 +++++++++++++++----------------------------- 2 files changed, 130 insertions(+), 260 deletions(-) diff --git a/doc/experiments.rst b/doc/experiments.rst index 69f7ae28f..60846e59a 100644 --- a/doc/experiments.rst +++ b/doc/experiments.rst @@ -136,11 +136,6 @@ documented in :ref:`/workflows.rst`. # of a different configuration option named "lineage_seed". # Both seed and lineage_seed are supposed to be integers. "seed": 0, - # Number of initial simulations for each specific seed. - # Specifically, For example, if you pass in seed = 100 and n_init_sims = 5, - # then Nextflow channel will receive [100, 101, 102, 103, 104], - # and you can view each channel as a separate lineage. - "n_init_sims": 1, # Special flags to enable mechanisms related to antibiotic resistance. # See API documentation for ecoli.library.sim_data.LoadSimData for more # information. 
diff --git a/doc/hpc.rst b/doc/hpc.rst index 93d44831e..85346b483 100644 --- a/doc/hpc.rst +++ b/doc/hpc.rst @@ -32,6 +32,72 @@ Setup .. note:: The following setup applies to members of the Covert Lab only. +Request a Sherlock Account +-------------------------- + +If you've never had a Sherlock account: Go to https://www.sherlock.stanford.edu/ and click on ``Request an Account`` + +.. note:: + Markus will have to approve this. + +If you've had a Sherlock account for a previous group: Email srcc-support@stanford.edu and ask them to move your account to mcovert, and CC Markus on the email and in the email body ask for Markus to give approval + +Additional Resources: Sherlock Documentation from Stanford +---------------------------------------------------------- + +* https://srcc.stanford.edu/workshops/sherlock-boarding-session +* https://www.sherlock.stanford.edu/docs/ + +Login to Sherlock +----------------- + +.. code-block:: bash + + ssh @login.sherlock.stanford.edu + # Type in Stanford Password + # Do the Duo authentication + # The following setup steps should be done using the Sherlock terminal + # NOTE that this is a LOGIN node, so no major computing should be done here + + # It is best to use a compute node for things like cloning the repo, running code, resetting lpad, etc + + srun -p mcovert --time=4:00:00 --cpus-per-task=2 --pty bash + + # srun is the command for launching a job step under Slurm + # -p or --partition specifies which partition (queue) to use, choose covert :D + # --time: sets the job's wall‑clock time limit + # --cpus-per-task specifies # CPU cores for each task in this job step + # --pty: allocates a pseudo‑terminal (TTY) to run an interactive session + # bash: launching a Bash shell + # When it finished, usually you can see your JOB ID in your shell + + # You can use scancel to abort your job step + scancel + +You can also refer to the Sherlock Documentation: https://www.sherlock.stanford.edu/docs/getting-started/connecting/ + +Clone the 
vEcoli Repository +---------------------------- + +1. Git clone the vEcoli repo to your Sherlock account: + +.. code-block:: bash + + git clone https://github.com/CovertLab/vEcoli.git + +If you have already created your branch, you can use: + +.. code-block:: bash + + # View all the branches (including remote branch) + git branch -a + + # Checkout to your own branch + git checkout + + # Validate your current branch + git branch + After cloning the model repository to your home directory, add the following lines to your ``~/.bash_profile``, then close and reopen your SSH connection: @@ -65,6 +131,18 @@ a workflow on Sherlock. To run scripts on Sherlock outside a workflow, see :ref:`sherlock-interactive`. To run scripts on Sherlock through a SLURM batch script, see :ref:`sherlock-noninteractive`. +.. tip:: + * You can use ``nano`` as text editor: + + .. code-block:: bash + + nano ~/.bash_profile + # After writing, you can use Ctrl+O to write out, Enter to confirm, and Ctrl+X to exit + + * If you choose to use ``vim``, press ``i`` for insert, and press ``Esc``, then type ``:wq`` and Enter for writing out + * Before running the ``python3`` to set up the env, ensure you are in the vEcoli repo + * It usually takes time to run first job + .. note:: The above setup is sufficient to run workflows on Sherlock. However, if you have a compelling reason to update the shared Nextflow or HyperQueue binaries, @@ -75,6 +153,15 @@ To run scripts on Sherlock through a SLURM batch script, see :ref:`sherlock-noni Then, reset the permissions of the updated binaries with ``chmod 777 *``. +.. warning:: + + Before building your own config file and running an experiment, remember: + + Python scripts (other than runscripts/workflow.py) **WILL NOT** run on Sherlock directly. + This includes the standalone ParCa, simulation, and analysis run scripts. 
+ Instead, these scripts can be run inside an :ref:`sherlock-interactive` (ideal for script development or debugging) + or :ref:`sherlock-noninteractive` (ideal for longer or more resource-intensive scripts that do not require user input). + .. _sherlock-config: Configuration @@ -106,8 +193,11 @@ keys in your configuration JSON (note the top-level ``sherlock`` key): In addition to these options, you **MUST** set the emitter output directory (see description of ``emitter_arg`` in :ref:`json_config`) to a path with -enough space to store your workflow outputs. We recommend setting this to -a location in your ``$SCRATCH`` directory (e.g. ``/scratch/users/{username}/out``). +enough space to store your workflow outputs. + +.. important:: + We recommend setting ``emitter_arg`` to a location in your ``$SCRATCH`` directory (e.g. ``"out_dir": "/scratch/users/{username}/out"``), + since ``$HOME`` only has a pretty small storage limit (run ``sh_quota`` to view). If using the Parquet emitter and ``threaded`` is not set to false under ``emitter_arg``, a warning will be printed suggesting that you set ``threaded`` @@ -138,8 +228,9 @@ in the path to your config JSON. .. warning:: Remember to use ``python3`` to start workflows instead of ``python``. + This command is supposed to run on **login node**, which means there is no need to use ``srun`` to request a **compute node**. + If there is trouble with permission denied for nextflow (you can use ``nextflow -version`` to check out), you can try ``chmod a+rwx`` -This command should be run on a login node (no need to request a compute node). If ``build_image`` is true in your config JSON, the terminal will report that a SLURM job was submitted to build the container image. When the image build job starts, the terminal will report the build progress. @@ -151,10 +242,19 @@ job starts, the terminal will report the build progress. Do not make any changes to your cloned repository or close your SSH connection until the build has finished. 
-Once the build has finished, the terminal will report that a SLURM job +Once the build has finished, the terminal will report that a **SLURM job** was submitted for the Nextflow workflow orchestrator before exiting back to the shell. At this point, you are free to close your connection, -start additional workflows, etc. Unlike workflows run locally, Sherlock's +start additional workflows, etc. You can use ``squeue`` to view the status of your SLURM job: + +.. code-block:: bash + + # View by job + squeue -j + # View by user + squeue -u + +Unlike workflows run locally, Sherlock's containerized workflows mean any changes made to the repository after the container image has been built will not affect the running workflow. @@ -237,6 +337,8 @@ More specifically, users who wish to debug a failed workflow job should: Any changes that you make to ``/vEcoli`` inside the container are discarded when the container terminates. +To exit the interactive container, run the ``exit`` command. + To start an interactive container that reflects the current state of your cloned repository, navigate to your cloned repository and run the above command with the ``-d`` flag to start a "development" container: @@ -288,242 +390,7 @@ to include one of the following directives at the top of your script: - ``#SBATCH --partition=owners,normal``: Uses either the ``owners`` or ``normal`` partition. This is the recommended option for the vast majority of scripts. -Just as with interactive containers, to run scripts directly from your -cloned repository and not the snapshot, add the ``-d`` flag and drop the -``/vEcoli/`` prefix from script names. Note that changing files in your -cloned repository may affect SLURM batch jobs submitted with this flag. - -.. 
_sherlock-Quick-Start: - -Quick Start -=========== - -Initial Setup -^^^^^^^^^^^^^ - -Request a Sherlock Account --------------------------- - -If you've never had a Sherlock account: Go to https://www.sherlock.stanford.edu/ and click on ``Request an Account`` - -.. note:: - Markus will have to approve this. - -If you've had a Sherlock account for a previous group: Email srcc-support@stanford.edu and ask them to move your account to mcovert, and CC Markus on the email and in the email body ask for Markus to give approval - -Additional Resources: Sherlock Documentation from Stanford ----------------------------------------------------------- - -* https://srcc.stanford.edu/workshops/sherlock-boarding-session -* https://www.sherlock.stanford.edu/docs/ - -Login to Sherlock -^^^^^^^^^^^^^^^^^ - -.. code-block:: bash - - ssh @login.sherlock.stanford.edu - # Type in Stanford Password - # Do the Duo authentication - # The following setup steps should be done using the Sherlock terminal - # NOTE that this is a LOGIN node, so no major computing should be done here - - # It is best to use a compute node for things like cloning the repo, running code, resetting lpad, etc - - srun -p mcovert --time=4:00:00 --cpus-per-task=2 --pty bash - - # srun is the command for launching a job step under Slurm - # -p or --partition specifies which partition (queue) to use, choose covert :D - # --time: sets the job's wall‑clock time limit - # --cpus-per-task specifies # CPU cores for each task in this job step - # --pty: allocates a pseudo‑terminal (TTY) to run an interactive session - # bash: launching a Bash shell - # When it finished, usually you can see your JOB ID in your shell - - # You can use scancel to abort your job step - scancel - -You can also refer to the Sherlock Documentation: https://www.sherlock.stanford.edu/docs/getting-started/connecting/ - -Clone the vEcoli Repository -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -1. Git clone the vEcoli repo to your Sherlock account: - -.. 
code-block:: bash - - git clone https://github.com/CovertLab/vEcoli.git - -If you have already created your branch, you can use: - -.. code-block:: bash - - # View all the branches (including remote branch) - git branch -a - - # Checkout to your own branch - git checkout - - # Validate your current branch - git branch - -2. Set up your ``vEcoli`` based on the detailed tutorial at ``Set Up`` chapter of Sherlock. - -.. tip:: - * You can use ``nano`` as text editor: - - .. code-block:: bash - - nano ~/.bash_profile - # After writing, you can use Ctrl+O to write out, Enter to confirm, and Ctrl+X to exit - - * If you choose to use ``vim``, press ``i`` for insert, and press ``Esc``, then type ``:wq`` and Enter for writing out - * Before running the ``python3`` to set up the env, ensure you are in the vEcoli repo - * It usually takes time to run first job - -Submit Your Job -^^^^^^^^^^^^^^^ - -Module Loading --------------- - -For the time we login Sherlock, first we can use ``module load`` to load crucial tools for experiments: - -.. code-block:: bash - - # Load newer Git, Java (for nextflow), and Python - module load system git java/21.0.4 python/3.12.1 - # Include shared Nextflow and HyperQueue installations on PATH - export PATH=$PATH:$GROUP_HOME/vEcoli_env - -.. note:: - This only needs to be done once. - -Configuration Setup -------------------- - -1. Before running your job, you should refer to the tutorial to construct the config for Sherlock - -.. important:: - Since ``$HOME`` only has a pretty small storage limit (run ``sh_quota`` to view), it is **highly recommended** to use ``$SCRATCH`` as your ``emitter_arg`` instead (like: ``"out_dir": "/scratch/users//out"``). - -2. With configuration files, a workflow for vEcoli can be started with: - -.. code-block:: bash - - python3 runscripts/workflow.py --config - -If ``build_image`` is true in your config JSON, the terminal will report that a **SLURM job** was submitted to build the container image. 
-When the image build job starts, the terminal will report the build progress. - -.. note:: - * Remember to use ``python3`` instead of ``python`` - * This command is supposed to run on **login node**, which means there is no need to use ``srun`` to request a **compute node** - * If there is trouble with permission denied for nextflow (you can use ``nextflow -version`` to check out), you can try ``chmod a+rwx`` - -Job Monitoring --------------- - -3. Once the build has finished, the terminal will report that a **SLURM job** was submitted for the Nextflow workflow orchestrator before exiting back to the shell. -At this point, you are free to close your connection, start additional workflows, etc. - -.. note:: - Unlike workflows run locally, Sherlock's containerized workflows mean any changes made to the repository after the container image has been built will not affect the running workflow. - -4. You can use ``squeue`` to view the status of your job: - -.. code-block:: bash - - # View by job - squeue -j - # View by user - squeue -u - -5. Again, you can start additional, concurrent workflows that each build a new image with different modifications to the cloned repo. -By setting ``build_image`` to ``false`` and ``container_image`` to the path of previously saved image, you can save time by reusing a previously built image. - -Debug or Perform Analysis -------------------------- - -It's recommended to use ``Interactive Container`` -(you can view more details in the tutorial at interactive-container). - -Before building up any containers, first you should have a basic ``Image`` for vEcoli. - -For example, we can use ``test_sherlock.json`` as config for both checking the project and building up a basic image: - -.. code-block:: bash - - # At head node: - python3 runscripts/workflow.py --config configs/test_sherlock.json - -By default, the image file is at ``vEcoli/test_sherlock/test_image``. - -Then, you can use: - -.. 
code-block:: bash - - runscripts/container/interactive.sh -i -a - -to build an interactive image. - -Inside the image, you can just use ``python`` commands rather than ``uv``. -For example, you can do further analysis for your simulation results: - -.. code-block:: bash - - python3 runscripts/analysis.py --config - -Moreover, if you want to exit the image, just use ``exit`` command. - -Interactive vs Non-Interactive Containers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Interactive Container ----------------------- - -1. Start an interactive container with your full image path: - -.. code-block:: bash - - runscripts/container/interactive.sh -i -a - -.. note:: - In this way any changes that you make to ``/vEcoli`` inside the container are **discarded** when the container terminates, - and ``~`` and any environment variables like ``$SCRATCH`` do not work inside the container. - -If you want to start an interactive container that **reflects the current state** of your cloned repository, -navigate to your cloned repository and run the above command with the ``-d`` flag to start a "development" container: - -.. code-block:: bash - - runscripts/container/interactive.sh -i -a -d - -In this mode, instead of editing source files in ``/vEcoli``, -you can directly edit the source files in your cloned repository and have those changes immediately reflected when running those scripts inside the container. - -.. note:: - Any changes you make will **persist** after the container terminates and can be tracked using Git version control. - -For more detailed information, please refer to the tutorial interactive-container - -Non-Interactive Container --------------------------- - -To run any script inside a container with a non-interactive session, use the same command as **Interactive Container** but specify a command using the ``-c`` flag, for example: - -.. 
code-block:: bash - - runscripts/container/interactive.sh -i -c "python /vEcoli/runscripts/parca.py --config " - -Manual Script Execution with sbatch ------------------------------------- - -If you want to manually run scripts, you can use ``sbatch``. - -1. First, you should write your own Batch scripts: https://www.sherlock.stanford.edu/docs/getting-started/submitting/#batch-scripts - -Following is a sample for sbatch scripts: +The following is a sample sbatch script that requests more resources to analyze simulation results: .. code-block:: bash @@ -534,27 +401,36 @@ Following is a sample for sbatch scripts: #SBATCH --time=20:00 #SBATCH --ntasks=1 #SBATCH --partition=owners,normal - #SBATCH --cpus-per-task 4 - #SBATCH --mem=8GB + #SBATCH --cpus-per-task=4 + #SBATCH --mem=64GB - python3 runscripts/analysis.py --config + srun runscripts/container/interactive.sh -i -a -c "python runscripts/analysis.py --config " -2. Second, use ``sbatch`` to submit the job: +Then, use ``sbatch`` to submit the job: .. code-block:: bash sbatch .sh -You can use ``squeue`` to check your job status. Moreover, you can list the contents of the output file with the following commands: +The ``.err`` and ``.out`` files will be created in the same directory as the sbatch script. -.. code-block:: bash +Just as with interactive containers, to run scripts directly from your +cloned repository and not the snapshot, add the ``-d`` flag and drop the +``/vEcoli/`` prefix from script names. Note that changing files in your +cloned repository may affect SLURM batch jobs submitted with this flag. + +.. _Download Results to Local from Sherlock: - cat slurm-.out +Download Results to Local from Sherlock +==================================== -Download Results to Local -^^^^^^^^^^^^^^^^^^^^^^^^^^ +It's recommended to turn to +`Sherlock's Data Transfer documentation `_ +for details on transferring files to and from your local machine. -``SCP`` is convenient for downloading files from the cluster. 
You can simply execute the following on your **local terminal**: +The following are two common methods, ``scp`` and ``rsync``: + +``scp`` is convenient for downloading files from the cluster. You can simply execute the following on your **local terminal**: .. code-block:: bash @@ -564,9 +440,8 @@ Download Results to Local # If you only want to download single file: scp @login.sherlock.stanford.edu:/path/to/remote/file /path/to/local/destination/ -This will require your password and Duo validation. - -In practice, usually we want to get the analytical results for our simulation. Due to the report files being HTML files typically, we can turn to shell wildcard and use ``rsync`` with ``include/exclude`` filters: +In practice, we usually want to retrieve the analysis results for our simulation. +Because the report files are typically HTML files, we can use shell wildcards and ``rsync`` with ``include/exclude`` filters: .. code-block:: bash @@ -584,7 +459,7 @@ In practice, usually we want to get the analytical results for our simulation. D # -v: Verbose output # --prune-empty-dirs: Avoids creating empty directories on the local machine -This will also require your password and Duo validation. +Both ``scp`` and ``rsync`` will require your password and Duo validation. .. _other-cluster: From 68b0ef1bbee32437946633c21060b687a67d3636 Mon Sep 17 00:00:00 2001 From: annabellefowler Date: Fri, 22 Aug 2025 14:39:18 -0700 Subject: [PATCH 57/71] small edit to documentation --- doc/hpc.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/hpc.rst b/doc/hpc.rst index 85346b483..d96c08553 100644 --- a/doc/hpc.rst +++ b/doc/hpc.rst @@ -238,6 +238,10 @@ job starts, the terminal will report the build progress. .. note:: Files that match the patterns in ``.dockerignore`` are excluded from the image. +.. 
note:: + If the Apptainer build fails, eg: + ``FATAL: While performing build: conveyor failed to get: unexpected end of JSON input``, + try cleaning cache: ``apptainer cache clean`` .. warning:: Do not make any changes to your cloned repository or close your SSH connection until the build has finished. From db73593633577fc7a34fabdeda1e5bcbb9f78fca Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Tue, 5 Aug 2025 16:44:26 -0700 Subject: [PATCH 58/71] Create archive of files to include in Apptainer image Gets around 64K char limit on each section in Apptainer definition file --- runscripts/container/Singularity | 7 +++-- runscripts/container/build-image.sh | 44 +++++++++++------------------ 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/runscripts/container/Singularity b/runscripts/container/Singularity index 6df713271..a7adefc05 100644 --- a/runscripts/container/Singularity +++ b/runscripts/container/Singularity @@ -22,11 +22,12 @@ From: ghcr.io/astral-sh/uv@sha256:1cc0392c8aad8026ef3922e3f997fff0f31e506b0ffe95 website "https://www.covert.stanford.edu/" %files - # runscripts/container/build-image.sh has some custom logic to replace this - # with a set of files that honors .dockerignore - FILES_TO_ADD + repo.tar /repo.tar %post + mkdir /vEcoli + tar -xf /repo.tar -C /vEcoli + rm /repo.tar apt-get update && apt-get install -y gcc procps nano curl cd /vEcoli UV_CACHE_DIR="/vEcoli/.uv_cache" UV_COMPILE_BYTECODE=1 uv sync --frozen diff --git a/runscripts/container/build-image.sh b/runscripts/container/build-image.sh index 663a5c110..7d29e66d3 100755 --- a/runscripts/container/build-image.sh +++ b/runscripts/container/build-image.sh @@ -94,21 +94,19 @@ elif [ "$BUILD_APPTAINER" -ne 0 ]; then if [ -f "$ignore_file" ]; then echo "Processing patterns from $ignore_file" grep -v "^#" "$ignore_file" | grep -v "^$" | grep -v "^!" 
| while read -r pattern; do - # Handle patterns starting with / (root-relative) - if [[ "$pattern" == /* ]]; then - echo ".${pattern}" >>"$EXCLUDE_PATTERNS" - echo ".${pattern}/*" >>"$EXCLUDE_PATTERNS" - # Handle directory patterns ending with / - elif [[ "$pattern" == */ ]]; then - echo "./${pattern}*" >>"$EXCLUDE_PATTERNS" - echo "./*/${pattern}*" >>"$EXCLUDE_PATTERNS" - # Handle other patterns - else - echo "./*/${pattern}" >>"$EXCLUDE_PATTERNS" - echo "./${pattern}" >>"$EXCLUDE_PATTERNS" - echo "./${pattern}/*" >>"$EXCLUDE_PATTERNS" - echo "./*/${pattern}/*" >>"$EXCLUDE_PATTERNS" - fi + # Handle patterns starting with / (root-relative) + if [[ "$pattern" == /* ]]; then + echo ".${pattern}" >>"$EXCLUDE_PATTERNS" + echo ".${pattern}/*" >>"$EXCLUDE_PATTERNS" + # Handle directory patterns ending with / + elif [[ "$pattern" == */ ]]; then + echo "./${pattern}" >>"$EXCLUDE_PATTERNS" + echo "./${pattern}*" >>"$EXCLUDE_PATTERNS" + # Handle other patterns + else + echo "./${pattern}" >>"$EXCLUDE_PATTERNS" + echo "./${pattern}/*" >>"$EXCLUDE_PATTERNS" + fi done fi } @@ -123,16 +121,13 @@ elif [ "$BUILD_APPTAINER" -ne 0 ]; then FIND_CMD="$FIND_CMD ! 
-path \"$pattern\"" done <"$EXCLUDE_PATTERNS" - # Create a temporary file for our list of files - TEMP_FILES_LIST=$(mktemp) - TEMP_FILES+=("$TEMP_FILES_LIST") - echo "Executing: $FIND_CMD" # Execute the dynamically generated find command - eval "$FIND_CMD" >"$TEMP_FILES_LIST" + eval "$FIND_CMD -print0 | xargs -0 tar -cvf repo.tar" # Debug output - echo "Generated $(wc -l <"$TEMP_FILES_LIST") files to include in the image" + echo "Found $(du -sh repo.tar) of files to include in the image" + TEMP_FILES+=("repo.tar") # Initialize environment variables string DOT_ENV_VARS="" @@ -156,12 +151,7 @@ elif [ "$BUILD_APPTAINER" -ne 0 ]; then # Read the Singularity file line by line while IFS= read -r line; do - if [[ "$line" == *"FILES_TO_ADD"* ]]; then - # For the line containing FILES_TO_ADD, replace with formatted file paths - while IFS= read -r file; do - echo " $file /vEcoli/$file" >>"$TEMP_DEF" - done <"$TEMP_FILES_LIST" - elif [[ "$line" == *"DOT_ENV_VARS"* ]]; then + if [[ "$line" == *"DOT_ENV_VARS"* ]]; then echo "$DOT_ENV_VARS" >> "$TEMP_DEF" else # Otherwise just add the line as-is From ac5342c168b765c172c674fc07bfde9d71a26d2b Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Tue, 5 Aug 2025 16:45:25 -0700 Subject: [PATCH 59/71] Try substituting Apptainer env vars --- runscripts/container/Singularity | 2 +- runscripts/container/build-image.sh | 14 ++------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/runscripts/container/Singularity b/runscripts/container/Singularity index a7adefc05..33767e941 100644 --- a/runscripts/container/Singularity +++ b/runscripts/container/Singularity @@ -12,7 +12,7 @@ From: ghcr.io/astral-sh/uv@sha256:1cc0392c8aad8026ef3922e3f997fff0f31e506b0ffe95 export UV_CACHE_DIR="/vEcoli/.uv_cache" # runscripts/container/build-image.sh has some custom logic to replace this # with the environment variables that are set in .env - DOT_ENV_VARS + {{ dot_env_vars }} %labels application "Whole Cell Model Runtime Environment" diff 
--git a/runscripts/container/build-image.sh b/runscripts/container/build-image.sh index 7d29e66d3..1a55cbe9b 100755 --- a/runscripts/container/build-image.sh +++ b/runscripts/container/build-image.sh @@ -149,17 +149,6 @@ elif [ "$BUILD_APPTAINER" -ne 0 ]; then echo "Warning: .env not found" fi - # Read the Singularity file line by line - while IFS= read -r line; do - if [[ "$line" == *"DOT_ENV_VARS"* ]]; then - echo "$DOT_ENV_VARS" >> "$TEMP_DEF" - else - # Otherwise just add the line as-is - echo "$line" >>"$TEMP_DEF" - fi - done Date: Tue, 5 Aug 2025 17:13:41 -0700 Subject: [PATCH 60/71] Handle multiple vars in .env and reduce output --- runscripts/container/build-image.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/runscripts/container/build-image.sh b/runscripts/container/build-image.sh index 1a55cbe9b..87be4e6ca 100755 --- a/runscripts/container/build-image.sh +++ b/runscripts/container/build-image.sh @@ -96,11 +96,9 @@ elif [ "$BUILD_APPTAINER" -ne 0 ]; then grep -v "^#" "$ignore_file" | grep -v "^$" | grep -v "^!" 
| while read -r pattern; do # Handle patterns starting with / (root-relative) if [[ "$pattern" == /* ]]; then - echo ".${pattern}" >>"$EXCLUDE_PATTERNS" echo ".${pattern}/*" >>"$EXCLUDE_PATTERNS" # Handle directory patterns ending with / elif [[ "$pattern" == */ ]]; then - echo "./${pattern}" >>"$EXCLUDE_PATTERNS" echo "./${pattern}*" >>"$EXCLUDE_PATTERNS" # Handle other patterns else @@ -123,14 +121,14 @@ elif [ "$BUILD_APPTAINER" -ne 0 ]; then echo "Executing: $FIND_CMD" # Execute the dynamically generated find command - eval "$FIND_CMD -print0 | xargs -0 tar -cvf repo.tar" + eval "$FIND_CMD -print0 | xargs -0 tar -cf repo.tar" # Debug output - echo "Found $(du -sh repo.tar) of files to include in the image" + echo "Found $(du -sh repo.tar | awk '{print $1}') of files to include in the image" TEMP_FILES+=("repo.tar") # Initialize environment variables string - DOT_ENV_VARS="" + DOT_ENV_VARS=" " # Check if .env file exists if [ -f ".env" ]; then echo "Processing .env for Singularity environment..." @@ -141,7 +139,7 @@ elif [ "$BUILD_APPTAINER" -ne 0 ]; then # Strip any existing 'export ' prefix line=${line#export } # Add to environment variables string with export prefix - DOT_ENV_VARS+=" export $line"$'\n' + DOT_ENV_VARS+="export $line; " fi done < ".env" echo "Found $(echo "$DOT_ENV_VARS" | grep -c 'export ') environment variables" From c972c2cb2ddadd4979c8722faae059e4b8890561 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Fri, 22 Aug 2025 19:21:51 -0700 Subject: [PATCH 61/71] Fix heading underline --- doc/hpc.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/hpc.rst b/doc/hpc.rst index d96c08553..b5d70e9b7 100644 --- a/doc/hpc.rst +++ b/doc/hpc.rst @@ -426,7 +426,7 @@ cloned repository may affect SLURM batch jobs submitted with this flag. .. 
_Download Results to Local from Sherlock: Download Results to Local from Sherlock -==================================== +======================================= It's recommended to turn to `Sherlock's Data Transfer documentation `_ From 13feaa1447257d2791f5158e0f17a7c31808f747 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Fri, 22 Aug 2025 18:45:41 -0700 Subject: [PATCH 62/71] More robust `runscripts/analysis.py` option handling and docs --- doc/workflows.rst | 47 ++++++++++++++++++++++++++++-------------- runscripts/analysis.py | 42 ++++++++++++++++++++++++++++++------- 2 files changed, 66 insertions(+), 23 deletions(-) diff --git a/doc/workflows.rst b/doc/workflows.rst index 900744bec..cc3d093f4 100644 --- a/doc/workflows.rst +++ b/doc/workflows.rst @@ -332,23 +332,9 @@ options under the ``analysis_options`` key: 2 where the first element is the start and the second element is the end (exclusive) of a range of variant indices, initial seeds, or generations to filter data to before running analyses. Overrides corresponding non-range options. -- ``sim_data_path``: List of string paths to simulation data pickle files. If multiple - variants are given via ``variant`` or ``variant_range``, you must provide same number - of paths in the same order using this option. This option is mainly meant for internal use. - For a simpler alternative that also works if multiple experiment IDs are given with - ``experiment_id`` (variant indices may correspond to completely different variant - simulation data objects in different workflow runs), see ``variant_data_dir``. -- ``variant_metadata_path``: String path to ``metadata.json`` file saved by - :py:mod:`runscripts.create_variants` (see :ref:`variant_output`). This option is mainly - intended for internal use. For a simpler alternative that also works if multiple - experiment IDs are given via ``experiment_id``, see ``variant_data_dir``. 
-- ``variant_data_dir``: List of string paths to one or more directories containing - variant simulation data pickles and metadata saved by :py:mod:`runscripts.create_variants`. - Must provide one path for each experiment ID in ``experiment_ID`` and in the - same order. - ``validation_data_path``: List of string paths to validation data pickle files (generated by ParCa). Can pass any number of paths in any order and they will be - passed as is to analysis script ``plot`` functions. + passed as is to analysis script ``plot`` functions. Defaults to empty list. - ``outdir``: Local (relative or absolute) path to directory that serves as a prefix to the ``outdir`` argument for analysis script ``plot`` functions (see :ref:`analysis_template`). A copy of the configuration options @@ -369,6 +355,37 @@ options under the ``analysis_options`` key: ``single`` analyses 16 times. If you only want to run the ``single`` and ``multivariant`` analyses, specify ``["single", "multivariant"]`` using this option. +In addition to the options above, you MUST provide ONE of the following three sets of +additional options: + +1. ``variant_data_dir`` + + List of string paths to one or more directories containing + variant simulation data pickles and metadata saved by :py:mod:`runscripts.create_variants`. + Must provide exactly one path for each experiment ID in ``experiment_ID`` and in the + same order. This option is strongly recommended when analyzing data generated by + :py:mod:`runscripts.workflow`. + +2. ``sim_data_path`` AND ``variant`` / ``variant_range`` + + ``sim_data_path`` is a list of string paths to simulation data pickle files, one + corresponding to each variant ID in ``variant`` or ``variant_range``. Analysis + scripts will receive empty ``variant_metadata`` / ``variant_names`` dictionaries + and a ``sim_data_dict`` that maps each variant in ``variant`` or ``variant_range`` + to each simulation data file in ``sim_data_path``. 
This option is mainly meant for + analyzing one-off simulations run with :py:mod:`runscripts.sim` / + :py:mod:`ecoli.experiments.ecoli_master_sim`. + +3. ``sim_data_path``, ``variant_metadata_path``, AND ``variant`` / ``variant_range`` + + ``sim_data_path`` is a list of string paths to simulation data pickle files, one + corresponding to each variant ID in ``variant`` or ``variant_range``. + ``variant_metadata_path`` is a string path to a metadata JSON in + the same format as the ``metadata.json`` saved by :py:mod:`runscripts.create_variants` + (``{"variant name": {"variant ID 1": {"param 1": ..., ...}, ...}}``, + see :ref:`variant_output`). This is a middle ground between the above options that + gives the user maximum flexibility in mapping variants to simulation data files and + metadata. .. note:: You must also have the ``emitter_arg`` key in your config JSON with a ``out_dir`` or diff --git a/runscripts/analysis.py b/runscripts/analysis.py index ae03b123b..f40f5fc8b 100644 --- a/runscripts/analysis.py +++ b/runscripts/analysis.py @@ -105,6 +105,18 @@ def parse_variant_data_dir( return variant_metadata, sim_data_dict, variant_names +def make_sim_data_dict(exp_id: str, variants: list[int], sim_data_path: list[str]): + if len(variants) == 0: + raise ValueError( + "Must specify variant or variant_range if not using variant_data_dir" + ) + if len(sim_data_path) != len(variants): + raise ValueError( + "Must specify sim_data_path for each variant if not using variant_data_dir" + ) + return {exp_id: dict(zip(variants, sim_data_path))} + + def main(): parser = argparse.ArgumentParser() default_config = os.path.join(CONFIG_DIR_PATH, "default.json") @@ -259,6 +271,8 @@ def main(): duckdb_filter = " AND ".join(duckdb_filter) # Load variant metadata + if "experiment_id" not in config: + raise KeyError("Must provide at least one experiment ID with experiment_id") if len(config["experiment_id"]) > 1: assert "variant_data_dir" in config, ( "Must provide --variant_data_dir for 
each experiment ID." @@ -276,7 +290,7 @@ def main(): variant_metadata, sim_data_dict, variant_names = parse_variant_data_dir( config["experiment_id"], config["variant_data_dir"] ) - else: + elif "variant_metadata_path" in config: with open(config["variant_metadata_path"], "r") as f: variant_metadata = json.load(f) variant_name = list(variant_metadata.keys())[0] @@ -285,14 +299,26 @@ def main(): int(k): v for k, v in variant_metadata[variant_name].items() } } - sim_data_dict = { - config["experiment_id"][0]: dict( - zip(config["variant"], config["sim_data_path"]) - ) - } - variant_names = {config["experiment_id"][0]: variant_name} + variant_names = {config["experiment_id"][0]: variant_name} + sim_data_dict = make_sim_data_dict( + config["experiment_id"][0], + config.get("variant", []), + config.get("sim_data_path", []), + ) + else: + warnings.warn( + "No variant metadata provided. Using empty variant metadata/names dictionaries." + ) + variant_metadata = {config["experiment_id"][0]: {}} + variant_names = {config["experiment_id"][0]: None} + sim_data_dict = make_sim_data_dict( + config["experiment_id"][0], + config.get("variant", []), + config.get("sim_data_path", []), + ) # Save copy of config JSON with parameters for plots + os.makedirs(config["outdir"], exist_ok=True) metadata_path = os.path.join(os.path.abspath(config["outdir"]), "metadata.json") if os.path.exists(metadata_path): raise FileExistsError( @@ -376,7 +402,7 @@ def main(): config_q, success_q, sim_data_dict, - config["validation_data_path"], + config.get("validation_data_path", []), curr_outdir, variant_metadata, variant_names, From c475eeae41948bd8cbd59330f8d6f373b542b321 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Fri, 22 Aug 2025 19:07:52 -0700 Subject: [PATCH 63/71] More robust image path handling and docs --- doc/hpc.rst | 18 ++++++++++++++++-- runscripts/workflow.py | 6 ++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/hpc.rst b/doc/hpc.rst index b5d70e9b7..31deafd86 
100644 --- a/doc/hpc.rst +++ b/doc/hpc.rst @@ -128,6 +128,13 @@ See :ref:`sherlock-config` for a description of the Sherlock-specific configuration options and :ref:`sherlock-running` for details about running a workflow on Sherlock. +.. note:: + ``test_sherlock.json`` sets ``out_dir`` to ``.``. In relative path syntax, + this refers to the current directory, meaning the cloned repo. This makes + the configuration portable as it does not assume the presence of any other + folders. However, as noted in :ref:`sherlock-config`, we recommend changing + this in your workflows. + To run scripts on Sherlock outside a workflow, see :ref:`sherlock-interactive`. To run scripts on Sherlock through a SLURM batch script, see :ref:`sherlock-noninteractive`. @@ -196,8 +203,15 @@ In addition to these options, you **MUST** set the emitter output directory enough space to store your workflow outputs. .. important:: - We recommend setting ``emitter_arg`` to a location in your ``$SCRATCH`` directory (e.g. ``"out_dir": "/scratch/users/{username}/out"``), - since ``$HOME`` only has a pretty small storage limit (run ``sh_quota`` to view). + We recommend setting ``out_dir`` under ``emitter_arg`` to a location in your + ``$SCRATCH`` directory to circumvent the ``$HOME`` storage limit + (run ``sh_quota`` to view). One way to do this is using an absolute path + (e.g. ``/scratch/users/{username}``). Alternatively, you can create a + symlink to your scratch directory by running the following command inside + your cloned repository: ``ln -s /scratch/users/{username} out`` (delete + ``out`` in your cloned repo first if it already exists). Then, using ``out`` + for ``out_dir`` will cause all simulation output to be redirected to your + scratch directory. 
If using the Parquet emitter and ``threaded`` is not set to false under ``emitter_arg``, a warning will be printed suggesting that you set ``threaded`` diff --git a/runscripts/workflow.py b/runscripts/workflow.py index c906b57df..c310d839a 100644 --- a/runscripts/workflow.py +++ b/runscripts/workflow.py @@ -509,6 +509,12 @@ def main(): container_image = sherlock_config.get("container_image", None) if container_image is None: raise RuntimeError("Must supply name for container image.") + image_dir = os.path.dirname(container_image) + if not os.path.exists(image_dir): + warnings.warn( + f"Container image directory does not exist, creating: {image_dir}." + ) + os.makedirs(image_dir, exist_ok=True) if sherlock_config.get("build_image", False): image_cmd = " ".join(build_image_cmd(container_image, True)) image_build_script = os.path.join(local_outdir, "container.sh") From e1e70b5a2dbda43629ccb54aaab0377ba5542e24 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Fri, 22 Aug 2025 19:20:20 -0700 Subject: [PATCH 64/71] Link docs in template configs --- configs/templates/analysis_standalone.json | 4 ++++ configs/templates/create_variants_standalone.json | 4 ++++ configs/templates/parca_standalone.json | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/configs/templates/analysis_standalone.json b/configs/templates/analysis_standalone.json index 9fbc1d9b2..a4eaab524 100644 --- a/configs/templates/analysis_standalone.json +++ b/configs/templates/analysis_standalone.json @@ -1,4 +1,8 @@ { + # This file shows a typical set of configuration options for running "runscripts/analysis.py" + # Your own config file may have fewer or more options depending on your needs + # Refer to https://covertlab.github.io/vEcoli/workflows.html#analysis-config + # Delete these comments from your own config file "emitter_arg": { "out_dir": "out_dir/from/simulation/run" }, diff --git a/configs/templates/create_variants_standalone.json b/configs/templates/create_variants_standalone.json index 
c02056a7b..2f9fda83a 100644 --- a/configs/templates/create_variants_standalone.json +++ b/configs/templates/create_variants_standalone.json @@ -1,4 +1,8 @@ { + # This file shows a typical set of configuration options for running "runscripts/create_variants.py" + # Your own config file may have fewer or more options depending on your needs + # Refer to https://covertlab.github.io/vEcoli/workflows.html#id2 for details + # Delete these comments from your own config file "variants": { "some_variant_module": { "param1": { diff --git a/configs/templates/parca_standalone.json b/configs/templates/parca_standalone.json index 735b8e15f..55e0084c9 100644 --- a/configs/templates/parca_standalone.json +++ b/configs/templates/parca_standalone.json @@ -1,4 +1,8 @@ { + # This file shows a typical set of configuration options for running "runscripts/parca.py" + # Your own config file may have fewer or more options depending on your needs + # Refer to https://covertlab.github.io/vEcoli/workflows.html#configuration for details + # Delete these comments from your own config file "sim_data_path": null, "parca_options": { "cpus": 1, From 434b767032683c1d955898707fae6eab4afaad62 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Mon, 25 Aug 2025 13:55:11 -0700 Subject: [PATCH 65/71] Document variant_data_dir generic path --- doc/workflows.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/workflows.rst b/doc/workflows.rst index cc3d093f4..3fd99eb8e 100644 --- a/doc/workflows.rst +++ b/doc/workflows.rst @@ -364,7 +364,8 @@ additional options: variant simulation data pickles and metadata saved by :py:mod:`runscripts.create_variants`. Must provide exactly one path for each experiment ID in ``experiment_ID`` and in the same order. This option is strongly recommended when analyzing data generated by - :py:mod:`runscripts.workflow`. + :py:mod:`runscripts.workflow`, in which case it should look like: + ``{out_dir}/{experiment_id}/variant_sim_data``. 2. 
``sim_data_path`` AND ``variant`` / ``variant_range`` From 4188de1a171765ecf2ac54b5e8e5fd6e0758efab Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Mon, 25 Aug 2025 14:04:19 -0700 Subject: [PATCH 66/71] Better error handling to address Copilot review --- runscripts/container/Singularity | 8 +++++--- runscripts/container/build-image.sh | 14 +++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/runscripts/container/Singularity b/runscripts/container/Singularity index 33767e941..9a7c16411 100644 --- a/runscripts/container/Singularity +++ b/runscripts/container/Singularity @@ -25,9 +25,11 @@ From: ghcr.io/astral-sh/uv@sha256:1cc0392c8aad8026ef3922e3f997fff0f31e506b0ffe95 repo.tar /repo.tar %post - mkdir /vEcoli - tar -xf /repo.tar -C /vEcoli - rm /repo.tar + mkdir -p /vEcoli + if [ -f /repo.tar ]; then + tar -xf /repo.tar -C /vEcoli + rm /repo.tar + fi apt-get update && apt-get install -y gcc procps nano curl cd /vEcoli UV_CACHE_DIR="/vEcoli/.uv_cache" UV_COMPILE_BYTECODE=1 uv sync --frozen diff --git a/runscripts/container/build-image.sh b/runscripts/container/build-image.sh index 87be4e6ca..1ffc72236 100755 --- a/runscripts/container/build-image.sh +++ b/runscripts/container/build-image.sh @@ -79,10 +79,6 @@ if [ "$RUN_LOCAL" -ne 0 ]; then --build-arg git_branch="${GIT_BRANCH}" \ --build-arg timestamp="${TIMESTAMP}" . 
elif [ "$BUILD_APPTAINER" -ne 0 ]; then - # Create a temporary Singularity definition file - TEMP_DEF=$(mktemp) - TEMP_FILES+=("$TEMP_DEF") - # Create a temporary file for find exclude patterns EXCLUDE_PATTERNS=$(mktemp) TEMP_FILES+=("$EXCLUDE_PATTERNS") @@ -121,7 +117,15 @@ elif [ "$BUILD_APPTAINER" -ne 0 ]; then echo "Executing: $FIND_CMD" # Execute the dynamically generated find command - eval "$FIND_CMD -print0 | xargs -0 tar -cf repo.tar" + TEMP_FILE_LIST=$(mktemp) + TEMP_FILES+=("$TEMP_FILE_LIST") + eval "$FIND_CMD -print0" > "$TEMP_FILE_LIST" + if [ -s "$TEMP_FILE_LIST" ]; then + tar -cf repo.tar --null -T "$TEMP_FILE_LIST" + else + echo "ERROR: No files found to include in the image" + exit 1 + fi # Debug output echo "Found $(du -sh repo.tar | awk '{print $1}') of files to include in the image" From 3c8d02daa5ff5d0e0820d9e58a9a5cde24fcafb9 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Mon, 25 Aug 2025 15:47:00 -0700 Subject: [PATCH 67/71] Exclude test_sherlock output from later images --- .dockerignore | 4 ++++ .gitignore | 4 ++++ doc/hpc.rst | 5 ++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 83083a723..a38f245c3 100644 --- a/.dockerignore +++ b/.dockerignore @@ -64,3 +64,7 @@ trace-* # uv venv # ########### .venv/ + +# Sherlock test # +################# +test_sherlock/ diff --git a/.gitignore b/.gitignore index d2763b856..a5864a83a 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,7 @@ trace-* ####################### .venv/ +# Sherlock test # +################# +test_sherlock/ + diff --git a/doc/hpc.rst b/doc/hpc.rst index 31deafd86..a06dde848 100644 --- a/doc/hpc.rst +++ b/doc/hpc.rst @@ -183,7 +183,10 @@ keys in your configuration JSON (note the top-level ``sherlock`` key): "sherlock": { # Boolean, whether to build a fresh Apptainer image. If files that are # not excluded by .dockerignore did not change since your last build, - # you can set this to false to skip building the image. 
+ # you can set this to false to skip building the image. DO NOT set this + # to a location in the cloned repo or else the resulting image(s) will be + # included in future image builds. test_sherlock.json is an exception + # because the test_sherlock folder is ignored by .dockerignore. "build_image": true, # Path (relative or absolute, including file name) of Apptainer image to # build (or use directly, if build_image is false) From 5877c8289ca63dc1dfe34a8f717356066f7d0df9 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Tue, 26 Aug 2025 10:55:33 -0700 Subject: [PATCH 68/71] Get absolute path for Sherlock image dir --- runscripts/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runscripts/workflow.py b/runscripts/workflow.py index c310d839a..b77df01be 100644 --- a/runscripts/workflow.py +++ b/runscripts/workflow.py @@ -509,7 +509,7 @@ def main(): container_image = sherlock_config.get("container_image", None) if container_image is None: raise RuntimeError("Must supply name for container image.") - image_dir = os.path.dirname(container_image) + image_dir = os.path.abspath(os.path.dirname(container_image)) if not os.path.exists(image_dir): warnings.warn( f"Container image directory does not exist, creating: {image_dir}." 
From 0e9bc4ed20249b45fb37df79da3b927b68c4c8e5 Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Thu, 4 Sep 2025 11:32:24 -0700 Subject: [PATCH 69/71] Configurable generation skipping for protein counts validation --- .../multiseed/protein_counts_validation.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/ecoli/analysis/multiseed/protein_counts_validation.py b/ecoli/analysis/multiseed/protein_counts_validation.py index 2dc6ce1ad..d47c9acdf 100644 --- a/ecoli/analysis/multiseed/protein_counts_validation.py +++ b/ecoli/analysis/multiseed/protein_counts_validation.py @@ -1,6 +1,6 @@ import os import pickle -from typing import Any +from typing import Any, cast from duckdb import DuckDBPyConnection import numpy as np @@ -12,6 +12,7 @@ open_arbitrary_sim_data, ndlist_to_ndarray, read_stacked_columns, + skip_n_gens, ) from wholecell.utils.protein_counts import get_simulated_validation_counts @@ -28,14 +29,31 @@ def plot( variant_metadata: dict[str, dict[int, Any]], variant_names: dict[str, str], ): + """ + Plot average monomer counts in simulation against Schmidt 2015 and Wisniewski 2014. 
+ + Args: + params: Dictionary containing parameters of the format:: + + { + # Number of initial generations worth of data to skip + "skip_n_gens": int + } + + """ with open_arbitrary_sim_data(sim_data_paths) as f: sim_data = pickle.load(f) with open(validation_data_paths[0], "rb") as f: validation_data = pickle.load(f) - subquery = read_stacked_columns( - history_sql, ["listeners__monomer_counts"], order_results=False + subquery = cast( + str, + read_stacked_columns( + history_sql, ["listeners__monomer_counts"], order_results=False + ), ) + if params.get("skip_n_gens"): + subquery = skip_n_gens(subquery, params["skip_n_gens"]) monomer_counts = conn.sql(f""" WITH unnested_counts AS ( SELECT unnest(listeners__monomer_counts) AS counts, From bc1c7115553ada56ad3aab2314bf2c4555587fed Mon Sep 17 00:00:00 2001 From: Sean Cheah Date: Thu, 4 Sep 2025 11:36:57 -0700 Subject: [PATCH 70/71] Fix generation skipping logic --- ecoli/library/parquet_emitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecoli/library/parquet_emitter.py b/ecoli/library/parquet_emitter.py index 903fd3132..026094ef0 100644 --- a/ecoli/library/parquet_emitter.py +++ b/ecoli/library/parquet_emitter.py @@ -197,7 +197,7 @@ def skip_n_gens(subquery: str, n: int) -> str: """ Modifies a DuckDB SQL query to skip the first ``n`` generations of data. 
""" - return f"SELECT * FROM ({subquery}) WHERE generation >= {n}" + return f"SELECT * FROM ({subquery}) WHERE generation > {n}" def ndlist_to_ndarray(s) -> np.ndarray: From 7f4124eec9f75e1719353e1c093ed611e9b2f5bc Mon Sep 17 00:00:00 2001 From: HSMSC <144365476+HSMSC@users.noreply.github.com> Date: Wed, 17 Sep 2025 21:43:55 -0700 Subject: [PATCH 71/71] Update media_recipes.tsv adding arginine to minimal media --- reconstruction/ecoli/flat/condition/media_recipes.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/reconstruction/ecoli/flat/condition/media_recipes.tsv b/reconstruction/ecoli/flat/condition/media_recipes.tsv index 429fdbe54..2df4dc1f4 100644 --- a/reconstruction/ecoli/flat/condition/media_recipes.tsv +++ b/reconstruction/ecoli/flat/condition/media_recipes.tsv @@ -20,3 +20,4 @@ "minimal_plus_quercetin" "MIX0-57" 1.0 "" 0 ["CPD-520"] [Infinity] [] [] "minimal_plus_sam" "MIX0-57" 1.0 "" 0 ["S-ADENOSYLMETHIONINE"] [Infinity] [] [] "minimal_plus_tungstate" "MIX0-57" 1.0 "" 0 ["TUNGSTATE"] [Infinity] [] [] +"minimal_plus_arginine" "MIX0-57" 1.0 "" 0 ["Arginine"] [Infinity] [] []