1. Imports
[1]:
import sys
import os
import pandas as pd
from IPython.display import display
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "..")))
from Seq_Sim.utils.seq_sim_utils import (
load_config,
generate_and_save_features
)
2. Specify Number of Samples and Fold Change
[2]:
# Simulation inputs passed to generate_and_save_features below:
#   num_samples - how many samples to generate
#   fold_change - fold change between the two classes
num_samples, fold_change = 10, 0.1
3. Specify Configuration File Parameters
[3]:
# The configuration can either be loaded from a YAML file:
#
#     config = load_config("../config.yml")
#
# or, as done here, be spelled out inline as a plain dictionary.
config = dict(
    # Logging, output location and script paths
    log_file="error.log",
    data_file_path="./data/",
    file_path_to_simulation="Seq_Sim/seq_sim.py",
    functions_script_path="Seq_Sim/utils/seq_sim_utils.py",
    file_prefix="sim_data",
    # Sample counts, fold changes and thread count
    num_samples=[10, 20, 30],
    fold_changes=[0.1, 0.75, 1.5, 3],
    n_threads=4,
    # Parameters of the simulated (dummy) dataset
    dummy_dataset_params=dict(
        n_cells=100,
        sd_celltypes=0.1,
        n_major_cell_types=7,
        n_minor_cell_types=3,
        relative_abundance=0.4,
        n_major_diff_celltypes=1,
        n_minor_diff_celltypes=1,
        n_batchs=4,
        prop_sex=0.5,
        prop_disease=0.5,
        seed=1234,
        n_features=1000,
    ),
    # Variance settings
    variance_attributes=dict(cluster_ratio=0.7),
    ratio_variance=0.1,
    # Column names used in the generated tables
    column_information=dict(
        cluster_col="cell_type",
        disease_col="disease",
        individual_col="subject_id",
    ),
    # Which output files to write
    files_to_save=dict(feature_matrix=True, latent_factors=True),
)
4. Generate and Save Sequencing Data
[4]:
# Run the simulation for the sample count and fold change chosen above.
try:
    generate_and_save_features(num_samples, fold_change, config)
except Exception as e:
    # Print a short summary on stderr, then re-raise so the notebook shows the
    # full traceback and execution stops at this cell. Calling sys.exit(1)
    # here would raise SystemExit inside the kernel and, together with
    # printing only the message, hide where the failure originated.
    print(f"Simulation failed: {e}", file=sys.stderr)
    raise
5. Verify That Files Were Saved Properly
[5]:
# List the files in the output directory named in the configuration.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the preview order reproducible from run to run.
files = sorted(os.listdir(config["data_file_path"]))

# Show a short preview of every CSV file found there.
for file in files:
    # Skip anything that is not a CSV so pd.read_csv only sees CSV files
    if not file.endswith(".csv"):
        continue
    file_path = os.path.join(config["data_file_path"], file)
    df = pd.read_csv(file_path)
    print(f"Preview of {file}:")
    display(df.head())  # rich (HTML) rendering of the first rows in Jupyter
    print("\n")  # add some spacing between tables
Preview of sim_data_pseudo_feature_num_samples_10_fc_0.1.csv:
| Feature1 | Feature2 | Feature3 | Feature4 | Feature5 | Feature6 | Feature7 | Feature8 | Feature9 | Feature10 | ... | Feature991 | Feature992 | Feature993 | Feature994 | Feature995 | Feature996 | Feature997 | Feature998 | Feature999 | Feature1000 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -11942.239893 | 12.943553 | -318.847654 | 2695.776328 | -2139.814545 | -717.146778 | 1539.996661 | -4706.707206 | 23.007149 | 1162.154929 | ... | -2224.586927 | 1495.867963 | -2690.879528 | 37.646723 | 1435.085427 | 13271.111625 | 659.564345 | -5542.467489 | -3158.842850 | 4542.571977 |
| 1 | 2439.293515 | -10658.214127 | 4404.059579 | 1752.411046 | -1133.866572 | -3194.139650 | 4731.903754 | -5268.524990 | 7.737832 | -5783.376093 | ... | 8555.393323 | -5691.212264 | 4472.742030 | 30.222789 | 5634.529341 | -3354.225469 | -4704.306326 | -1173.377176 | 11995.770742 | -1147.780931 |
| 2 | 4031.830996 | -1820.018802 | -10320.317301 | -11446.265464 | 3671.921902 | 11080.449494 | 9256.739749 | -5847.214536 | -2693.503273 | 4293.121097 | ... | -5144.520559 | 3127.773453 | -122.378148 | 11404.342330 | 3453.546819 | 1726.924090 | 3215.562899 | 3018.529486 | -3643.919678 | -3268.666481 |
| 3 | 2237.486889 | 25.665411 | -3193.970434 | 6601.299042 | 11282.278936 | 2098.872337 | 684.774436 | -2661.093197 | 4053.436800 | -2418.361719 | ... | -3363.379895 | -271.982509 | -2515.471402 | 30.658347 | -3878.675356 | 5441.705662 | -15459.669432 | -3635.072480 | -223.881934 | -8847.167633 |
| 4 | 21.487903 | 4117.289188 | 2853.703536 | -14819.088168 | 3315.687257 | 1110.804968 | 5206.448494 | 1028.313768 | -4602.918356 | -4420.918656 | ... | -3415.907445 | 2830.855292 | 1928.415578 | 37.842845 | 1249.763068 | 64.976906 | -12450.703767 | -3026.076038 | -345.078986 | -9825.684247 |
5 rows × 1000 columns
Preview of sim_data_latent_data_num_samples_10_fc_0.1.csv:
| subject_id | sex | disease | age | batch | bmi | cell_type | |
|---|---|---|---|---|---|---|---|
| 0 | SUB_3 | 0 | 1 | 18 | 3 | 34 | E |
| 1 | SUB_3 | 0 | 1 | 18 | 3 | 34 | G |
| 2 | SUB_5 | 1 | 0 | 29 | 1 | 32 | F |
| 3 | SUB_7 | 0 | 1 | 21 | 3 | 34 | G |
| 4 | SUB_1 | 0 | 0 | 57 | 1 | 16 | E |
[ ]: