1. Imports

[1]:
import sys
import os
import pandas as pd
from IPython.display import display


# Make the directory two levels above the current working directory importable
# so that the `Seq_Sim` package imported below can be resolved.
# NOTE(review): presumably that directory is the repository root — confirm.
_two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
sys.path.append(os.path.abspath(_two_levels_up))

from Seq_Sim.utils.seq_sim_utils import (
    load_config,
    generate_and_save_features
)

2. Specify Number of Samples and Fold Change

[2]:
# Inputs for this single simulation run. Both values are handed to
# generate_and_save_features() further down and appear in the names of the
# generated CSV files.
num_samples: int = 10      # number of samples to generate
fold_change: float = 0.1   # fold change between the two classes

3. Specify Configuration File Parameters

[3]:
# The configuration can come from one of two places:
#
#   * a YAML file, read with the imported helper:
#         config_file = "../config.yml"
#         config = load_config(config_file)
#
#   * an inline dictionary, which is what this notebook uses below.

config = dict(
    # Log file, output directory, script locations and output file-name prefix.
    log_file="error.log",
    data_file_path="./data/",
    file_path_to_simulation="Seq_Sim/seq_sim.py",
    functions_script_path="Seq_Sim/utils/seq_sim_utils.py",
    file_prefix="sim_data",
    # NOTE(review): this notebook passes a single num_samples / fold_change
    # explicitly to generate_and_save_features(); presumably these lists drive
    # batch sweeps elsewhere — TODO confirm.
    num_samples=[10, 20, 30],
    fold_changes=[0.1, 0.75, 1.5, 3],
    n_threads=4,
    # Parameters describing the simulated dataset.
    dummy_dataset_params=dict(
        n_cells=100,
        sd_celltypes=0.1,
        n_major_cell_types=7,
        n_minor_cell_types=3,
        relative_abundance=0.4,
        n_major_diff_celltypes=1,
        n_minor_diff_celltypes=1,
        n_batchs=4,
        prop_sex=0.5,
        prop_disease=0.5,
        seed=1234,
        n_features=1000,
    ),
    # Variance settings.
    variance_attributes=dict(cluster_ratio=0.7),
    ratio_variance=0.1,
    # Column names used for cluster, disease status and individual.
    column_information=dict(
        cluster_col="cell_type",
        disease_col="disease",
        individual_col="subject_id",
    ),
    # Which outputs to write to disk.
    files_to_save=dict(feature_matrix=True, latent_factors=True),
)

4. Generate and Save Sequencing Data

[4]:
# Run the simulation for the chosen sample count and fold change using the
# settings in `config`.
try:
    generate_and_save_features(num_samples, fold_change, config)
except Exception as e:
    # Report the failure, then re-raise the original exception. Calling
    # sys.exit() here would replace the real error with a SystemExit and hide
    # its traceback; re-raising keeps the traceback visible and still stops the
    # run (a plain script still exits with a non-zero status).
    print(f"Simulation failed: {e}", file=sys.stderr)
    raise

5. Ensure Files Were Saved Properly

[5]:
# Verify the simulation output by previewing every CSV file found in the
# directory named by config["data_file_path"].

# os.listdir() returns entries in arbitrary order; sort them so the previews
# appear in the same order on every run.
files = sorted(os.listdir(config["data_file_path"]))

# Only CSV files can be read with pd.read_csv; skip anything else.
csv_files = [name for name in files if name.endswith(".csv")]

# This cell exists to confirm that output was written, so fail loudly instead
# of silently printing nothing when no CSV is present.
assert csv_files, f"No CSV files found in {config['data_file_path']!r}"

for file in csv_files:
    file_path = os.path.join(config["data_file_path"], file)
    df = pd.read_csv(file_path)

    # Show the first few rows of each file
    print(f"Preview of {file}:")
    display(df.head())  # rich table rendering in Jupyter
    print("\n")  # spacing between tables
Preview of sim_data_pseudo_feature_num_samples_10_fc_0.1.csv:
Feature1 Feature2 Feature3 Feature4 Feature5 Feature6 Feature7 Feature8 Feature9 Feature10 ... Feature991 Feature992 Feature993 Feature994 Feature995 Feature996 Feature997 Feature998 Feature999 Feature1000
0 -11942.239893 12.943553 -318.847654 2695.776328 -2139.814545 -717.146778 1539.996661 -4706.707206 23.007149 1162.154929 ... -2224.586927 1495.867963 -2690.879528 37.646723 1435.085427 13271.111625 659.564345 -5542.467489 -3158.842850 4542.571977
1 2439.293515 -10658.214127 4404.059579 1752.411046 -1133.866572 -3194.139650 4731.903754 -5268.524990 7.737832 -5783.376093 ... 8555.393323 -5691.212264 4472.742030 30.222789 5634.529341 -3354.225469 -4704.306326 -1173.377176 11995.770742 -1147.780931
2 4031.830996 -1820.018802 -10320.317301 -11446.265464 3671.921902 11080.449494 9256.739749 -5847.214536 -2693.503273 4293.121097 ... -5144.520559 3127.773453 -122.378148 11404.342330 3453.546819 1726.924090 3215.562899 3018.529486 -3643.919678 -3268.666481
3 2237.486889 25.665411 -3193.970434 6601.299042 11282.278936 2098.872337 684.774436 -2661.093197 4053.436800 -2418.361719 ... -3363.379895 -271.982509 -2515.471402 30.658347 -3878.675356 5441.705662 -15459.669432 -3635.072480 -223.881934 -8847.167633
4 21.487903 4117.289188 2853.703536 -14819.088168 3315.687257 1110.804968 5206.448494 1028.313768 -4602.918356 -4420.918656 ... -3415.907445 2830.855292 1928.415578 37.842845 1249.763068 64.976906 -12450.703767 -3026.076038 -345.078986 -9825.684247

5 rows × 1000 columns



Preview of sim_data_latent_data_num_samples_10_fc_0.1.csv:
subject_id sex disease age batch bmi cell_type
0 SUB_3 0 1 18 3 34 E
1 SUB_3 0 1 18 3 34 G
2 SUB_5 1 0 29 1 32 F
3 SUB_7 0 1 21 3 34 G
4 SUB_1 0 0 57 1 16 E


[ ]: