{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "import pandas as pd\n",
    "from IPython.display import display\n",
    "\n",
    "\n",
    "# Make the directory two levels above the working directory importable so the\n",
    "# Seq_Sim package resolves. NOTE: this depends on the kernel's working directory.\n",
    "repo_root = os.path.abspath(os.path.join(os.getcwd(), \"..\", \"..\"))\n",
    "if repo_root not in sys.path:  # keep the cell idempotent when it is re-run\n",
    "    sys.path.append(repo_root)\n",
    "\n",
    "from Seq_Sim.utils.seq_sim_utils import (\n",
    "    load_config,\n",
    "    generate_and_save_features\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Specify Number of Samples and Fold Change"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# specify the number of samples to generate\n",
    "num_samples = 10\n",
    "\n",
    "# specify the fold change between the two classes\n",
    "fold_change = 0.1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Specify Configuration File Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Option 1: load the configuration from a YAML file.\n",
    "# config_file = \"../config.yml\"\n",
    "# config = load_config(config_file)\n",
    "\n",
    "# Option 2: define the configuration inline.\n",
    "config = {\n",
    "    \"log_file\": \"error.log\",\n",
    "    \"data_file_path\": \"./data/\",\n",
    "    \"file_path_to_simulation\": \"Seq_Sim/seq_sim.py\",\n",
    "    \"functions_script_path\": \"Seq_Sim/utils/seq_sim_utils.py\",\n",
    "    \"file_prefix\": \"sim_data\",\n",
    "    \"num_samples\": [10, 20, 30],\n",
    "    \"fold_changes\": [0.1, 0.75, 1.5, 3],\n",
    "    \"n_threads\": 4,\n",
    "    \"dummy_dataset_params\": {\n",
    "        \"n_cells\": 100,\n",
    "        \"sd_celltypes\": 0.1,\n",
    "        \"n_major_cell_types\": 7,\n",
    "        \"n_minor_cell_types\": 3,\n",
    "        \"relative_abundance\": 0.4,\n",
    "        \"n_major_diff_celltypes\": 1,\n",
    "        \"n_minor_diff_celltypes\": 1,\n",
    "        \"n_batchs\": 4,\n",
    "        \"prop_sex\": 0.5,\n",
    "        \"prop_disease\": 0.5,\n",
    "        \"seed\": 1234,\n",
    "        \"n_features\": 1000,\n",
    "    },\n",
    "    \"variance_attributes\": {\"cluster_ratio\": 0.7},\n",
    "    \"ratio_variance\": 0.1,\n",
    "    \"column_information\": {\n",
    "        \"cluster_col\": \"cell_type\",\n",
    "        \"disease_col\": \"disease\",\n",
    "        \"individual_col\": \"subject_id\",\n",
    "    },\n",
    "    \"files_to_save\": {\"feature_matrix\": True, \"latent_factors\": True},\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. Generate and Save Sequencing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    # run the simulation\n",
    "    generate_and_save_features(num_samples, fold_change, config)\n",
    "\n",
    "except Exception as e:\n",
    "    # Report the failure, then re-raise so the full traceback is shown and\n",
    "    # \"Run All\" stops here. Calling sys.exit() inside a notebook only raises\n",
    "    # SystemExit in the kernel and hides where the error came from.\n",
    "    print(f\"Simulation failed: {e}\", file=sys.stderr)\n",
    "    raise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Ensure files were saved properly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Preview of sim_data_pseudo_feature_num_samples_10_fc_0.1.csv:\n"
     ]
    },
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Feature1Feature2Feature3Feature4Feature5Feature6Feature7Feature8Feature9Feature10...Feature991Feature992Feature993Feature994Feature995Feature996Feature997Feature998Feature999Feature1000
0-11942.23989312.943553-318.8476542695.776328-2139.814545-717.1467781539.996661-4706.70720623.0071491162.154929...-2224.5869271495.867963-2690.87952837.6467231435.08542713271.111625659.564345-5542.467489-3158.8428504542.571977
12439.293515-10658.2141274404.0595791752.411046-1133.866572-3194.1396504731.903754-5268.5249907.737832-5783.376093...8555.393323-5691.2122644472.74203030.2227895634.529341-3354.225469-4704.306326-1173.37717611995.770742-1147.780931
24031.830996-1820.018802-10320.317301-11446.2654643671.92190211080.4494949256.739749-5847.214536-2693.5032734293.121097...-5144.5205593127.773453-122.37814811404.3423303453.5468191726.9240903215.5628993018.529486-3643.919678-3268.666481
32237.48688925.665411-3193.9704346601.29904211282.2789362098.872337684.774436-2661.0931974053.436800-2418.361719...-3363.379895-271.982509-2515.47140230.658347-3878.6753565441.705662-15459.669432-3635.072480-223.881934-8847.167633
421.4879034117.2891882853.703536-14819.0881683315.6872571110.8049685206.4484941028.313768-4602.918356-4420.918656...-3415.9074452830.8552921928.41557837.8428451249.76306864.976906-12450.703767-3026.076038-345.078986-9825.684247
\n", "

5 rows × 1000 columns

\n", "
" ], "text/plain": [ " Feature1 Feature2 Feature3 Feature4 Feature5 \\\n", "0 -11942.239893 12.943553 -318.847654 2695.776328 -2139.814545 \n", "1 2439.293515 -10658.214127 4404.059579 1752.411046 -1133.866572 \n", "2 4031.830996 -1820.018802 -10320.317301 -11446.265464 3671.921902 \n", "3 2237.486889 25.665411 -3193.970434 6601.299042 11282.278936 \n", "4 21.487903 4117.289188 2853.703536 -14819.088168 3315.687257 \n", "\n", " Feature6 Feature7 Feature8 Feature9 Feature10 ... \\\n", "0 -717.146778 1539.996661 -4706.707206 23.007149 1162.154929 ... \n", "1 -3194.139650 4731.903754 -5268.524990 7.737832 -5783.376093 ... \n", "2 11080.449494 9256.739749 -5847.214536 -2693.503273 4293.121097 ... \n", "3 2098.872337 684.774436 -2661.093197 4053.436800 -2418.361719 ... \n", "4 1110.804968 5206.448494 1028.313768 -4602.918356 -4420.918656 ... \n", "\n", " Feature991 Feature992 Feature993 Feature994 Feature995 \\\n", "0 -2224.586927 1495.867963 -2690.879528 37.646723 1435.085427 \n", "1 8555.393323 -5691.212264 4472.742030 30.222789 5634.529341 \n", "2 -5144.520559 3127.773453 -122.378148 11404.342330 3453.546819 \n", "3 -3363.379895 -271.982509 -2515.471402 30.658347 -3878.675356 \n", "4 -3415.907445 2830.855292 1928.415578 37.842845 1249.763068 \n", "\n", " Feature996 Feature997 Feature998 Feature999 Feature1000 \n", "0 13271.111625 659.564345 -5542.467489 -3158.842850 4542.571977 \n", "1 -3354.225469 -4704.306326 -1173.377176 11995.770742 -1147.780931 \n", "2 1726.924090 3215.562899 3018.529486 -3643.919678 -3268.666481 \n", "3 5441.705662 -15459.669432 -3635.072480 -223.881934 -8847.167633 \n", "4 64.976906 -12450.703767 -3026.076038 -345.078986 -9825.684247 \n", "\n", "[5 rows x 1000 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "Preview of sim_data_latent_data_num_samples_10_fc_0.1.csv:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idsexdiseaseagebatchbmicell_type
0SUB_30118334E
1SUB_30118334G
2SUB_51029132F
3SUB_70121334G
4SUB_10057116E
\n", "
" ], "text/plain": [ " subject_id sex disease age batch bmi cell_type\n", "0 SUB_3 0 1 18 3 34 E\n", "1 SUB_3 0 1 18 3 34 G\n", "2 SUB_5 1 0 29 1 32 F\n", "3 SUB_7 0 1 21 3 34 G\n", "4 SUB_1 0 0 57 1 16 E" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] } ],
   "source": [
    "# Collect the CSV files in the configured output directory, sorted by name so\n",
    "# the previews appear in the same order on every run (os.listdir order is arbitrary).\n",
    "data_dir = config[\"data_file_path\"]\n",
    "csv_names = sorted(name for name in os.listdir(data_dir) if name.endswith(\".csv\"))\n",
    "\n",
    "# Fail loudly instead of silently showing nothing when no output was written.\n",
    "if not csv_names:\n",
    "    raise FileNotFoundError(f\"No CSV files found in {data_dir!r}\")\n",
    "\n",
    "# Loop through the files and display their content\n",
    "for csv_name in csv_names:\n",
    "    # Only the first rows are shown, so read just those rather than the whole file.\n",
    "    preview = pd.read_csv(os.path.join(data_dir, csv_name), nrows=5)\n",
    "\n",
    "    # Display the first few rows of the DataFrame\n",
    "    print(f\"Preview of {csv_name}:\")\n",
    "    display(preview)  # Prettier display in Jupyter Notebook\n",
    "    print(\"\\n\")  # Add some spacing between tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chen5150",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}