1. Imports¶

[1]:

import sys
import os
import pandas as pd
from IPython.display import display


# Append the directory two levels above the current working directory to the
# module search path. Presumably that directory is the repository root holding
# the `Seq_Sim` package imported below -- verify if this notebook is moved or
# launched from a different working directory.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "..")))

from Seq_Sim.utils.seq_sim_utils import (
    load_config,
    generate_and_save_features
)

2. Specify Number of Samples and Fold Change¶

[2]:

# Inputs for this single simulation run; both are passed to
# generate_and_save_features() in the generation cell.
num_samples = 10   # how many samples to generate
fold_change = 0.1  # fold change between the two classes

3. Specify Configuration File Parameters¶

[3]:

# The configuration can instead be read from a YAML file:
#     config = load_config("../config.yml")
# Here it is spelled out inline so that every setting is visible in the notebook.
config = dict(
    # Logging, input/output locations and output file naming.
    log_file="error.log",
    data_file_path="./data/",
    file_path_to_simulation="Seq_Sim/seq_sim.py",
    functions_script_path="Seq_Sim/utils/seq_sim_utils.py",
    file_prefix="sim_data",
    # Parameter grids and thread count.
    num_samples=[10, 20, 30],
    fold_changes=[0.1, 0.75, 1.5, 3],
    n_threads=4,
    # Settings for the dummy dataset.
    dummy_dataset_params=dict(
        n_cells=100,
        sd_celltypes=0.1,
        n_major_cell_types=7,
        n_minor_cell_types=3,
        relative_abundance=0.4,
        n_major_diff_celltypes=1,
        n_minor_diff_celltypes=1,
        n_batchs=4,
        prop_sex=0.5,
        prop_disease=0.5,
        seed=1234,
        n_features=1000,
    ),
    # Variance settings.
    variance_attributes=dict(cluster_ratio=0.7),
    ratio_variance=0.1,
    # Column names used for cluster, disease and individual.
    column_information=dict(
        cluster_col="cell_type",
        disease_col="disease",
        individual_col="subject_id",
    ),
    # Which output files to save.
    files_to_save=dict(feature_matrix=True, latent_factors=True),
)

4. Generate and Save Sequencing Data¶

[4]:

# Run the simulation for the chosen sample count and fold change.
#
# On failure, a short summary is written to stderr and the original exception
# is re-raised unchanged, so the notebook shows the full traceback and
# "Run All" stops at this cell. Calling sys.exit() here would instead surface
# only a bare SystemExit and discard the real error's traceback.
try:
    generate_and_save_features(num_samples, fold_change, config)
except Exception as e:
    print(
        f"Simulation failed (num_samples={num_samples}, "
        f"fold_change={fold_change}): {e}",
        file=sys.stderr,
    )
    raise

5. Ensure Files Were Saved Properly¶

[5]:

# Preview every CSV in the configured output directory to confirm that the
# simulation wrote its files.

# os.listdir() returns entries in arbitrary order; sort them so the previews
# appear in the same order on every run.
files = sorted(os.listdir(config["data_file_path"]))

# Only CSV files are read; anything else in the directory is skipped.
csv_files = [name for name in files if name.endswith(".csv")]

# Make an empty result visible instead of silently showing nothing.
if not csv_files:
    print(f"No CSV files found in {config['data_file_path']}")

for file in csv_files:
    file_path = os.path.join(config["data_file_path"], file)
    df = pd.read_csv(file_path)

    # Show the first few rows using the notebook's rich table rendering.
    print(f"Preview of {file}:")
    display(df.head())
    print("\n")  # spacing between tables

Preview of sim_data_pseudo_feature_num_samples_10_fc_0.1.csv:

	Feature1	Feature2	Feature3	Feature4	Feature5	Feature6	Feature7	Feature8	Feature9	Feature10	...	Feature991	Feature992	Feature993	Feature994	Feature995	Feature996	Feature997	Feature998	Feature999	Feature1000
0	-11942.239893	12.943553	-318.847654	2695.776328	-2139.814545	-717.146778	1539.996661	-4706.707206	23.007149	1162.154929	...	-2224.586927	1495.867963	-2690.879528	37.646723	1435.085427	13271.111625	659.564345	-5542.467489	-3158.842850	4542.571977
1	2439.293515	-10658.214127	4404.059579	1752.411046	-1133.866572	-3194.139650	4731.903754	-5268.524990	7.737832	-5783.376093	...	8555.393323	-5691.212264	4472.742030	30.222789	5634.529341	-3354.225469	-4704.306326	-1173.377176	11995.770742	-1147.780931
2	4031.830996	-1820.018802	-10320.317301	-11446.265464	3671.921902	11080.449494	9256.739749	-5847.214536	-2693.503273	4293.121097	...	-5144.520559	3127.773453	-122.378148	11404.342330	3453.546819	1726.924090	3215.562899	3018.529486	-3643.919678	-3268.666481
3	2237.486889	25.665411	-3193.970434	6601.299042	11282.278936	2098.872337	684.774436	-2661.093197	4053.436800	-2418.361719	...	-3363.379895	-271.982509	-2515.471402	30.658347	-3878.675356	5441.705662	-15459.669432	-3635.072480	-223.881934	-8847.167633
4	21.487903	4117.289188	2853.703536	-14819.088168	3315.687257	1110.804968	5206.448494	1028.313768	-4602.918356	-4420.918656	...	-3415.907445	2830.855292	1928.415578	37.842845	1249.763068	64.976906	-12450.703767	-3026.076038	-345.078986	-9825.684247

5 rows × 1000 columns



Preview of sim_data_latent_data_num_samples_10_fc_0.1.csv:

	subject_id	sex	disease	age	batch	bmi	cell_type
0	SUB_3	0	1	18	3	34	E
1	SUB_3	0	1	18	3	34	G
2	SUB_5	1	0	29	1	32	F
3	SUB_7	0	1	21	3	34	G
4	SUB_1	0	0	57	1	16	E

[ ]: