{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from IPython.display import display\n",
    "\n",
    "# Make the directory two levels above the current working directory\n",
    "# importable (presumably the repository root that contains `Seq_Sim`).\n",
    "# The membership check keeps re-runs of this cell from appending the\n",
    "# same path to sys.path again.\n",
    "repo_root = os.path.abspath(os.path.join(os.getcwd(), \"..\", \"..\"))\n",
    "if repo_root not in sys.path:\n",
    "    sys.path.append(repo_root)\n",
    "\n",
    "from Seq_Sim.utils.seq_sim_utils import (\n",
    "    load_config,\n",
    "    generate_and_save_features,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Specify Number of Samples and Fold Change"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# specify the number of samples to generate\n",
    "num_samples = 10\n",
    "\n",
    "# specify the fold change between the two classes\n",
    "fold_change = 0.1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Specify Configuration File Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# config_file = \"../config.yml\"\n",
    "# config = load_config(config_file)\n",
    "\n",
    "# or\n",
    "\n",
    "config = {\n",
    "    \"log_file\": \"error.log\",\n",
    "    \"data_file_path\": \"./data/\",\n",
    "    \"file_path_to_simulation\": \"Seq_Sim/seq_sim.py\",\n",
    "    \"functions_script_path\": \"Seq_Sim/utils/seq_sim_utils.py\",\n",
    "    \"file_prefix\": \"sim_data\",\n",
    "    \"num_samples\": [10, 20, 30],\n",
    "    \"fold_changes\": [0.1, 0.75, 1.5, 3],\n",
    "    \"n_threads\": 4,\n",
    "    \"dummy_dataset_params\": {\n",
    "        \"n_cells\": 100,\n",
    "        \"sd_celltypes\": 0.1,\n",
    "        \"n_major_cell_types\": 7,\n",
    "        \"n_minor_cell_types\": 3,\n",
    "        \"relative_abundance\": 0.4,\n",
    "        \"n_major_diff_celltypes\": 1,\n",
    "        \"n_minor_diff_celltypes\": 1,\n",
    "        \"n_batchs\": 4,\n",
    "        \"prop_sex\": 0.5,\n",
    "        \"prop_disease\": 0.5,\n",
    "        \"seed\": 1234,\n",
    "        \"n_features\": 1000,\n",
    "    },\n",
    "    \"variance_attributes\": {\"cluster_ratio\": 0.7},\n",
    "    \"ratio_variance\": 0.1,\n",
    "    \"column_information\": {\n",
    "        \"cluster_col\": \"cell_type\",\n",
    "        \"disease_col\": \"disease\",\n",
    "        \"individual_col\": \"subject_id\",\n",
    "    },\n",
    "    \"files_to_save\": {\"feature_matrix\": True, \"latent_factors\": True},\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. Generate and Save Sequencing Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    # run the simulation\n",
    "    generate_and_save_features(num_samples, fold_change, config)\n",
    "except Exception as e:\n",
    "    # Print a short message, then re-raise the original exception so the\n",
    "    # notebook shows the full traceback and execution stops at this cell.\n",
    "    # (Calling sys.exit(1) here would replace the real error with a\n",
    "    # SystemExit and hide where the failure happened.)\n",
    "    print(f\"Simulation failed: {e}\", file=sys.stderr)\n",
    "    raise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Ensure Files Were Saved Properly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Preview of sim_data_pseudo_feature_num_samples_10_fc_0.1.csv:\n"
     ]
    },
    {
     "data": {
      "text/html": [ "
| \n", " | Feature1 | \n", "Feature2 | \n", "Feature3 | \n", "Feature4 | \n", "Feature5 | \n", "Feature6 | \n", "Feature7 | \n", "Feature8 | \n", "Feature9 | \n", "Feature10 | \n", "... | \n", "Feature991 | \n", "Feature992 | \n", "Feature993 | \n", "Feature994 | \n", "Feature995 | \n", "Feature996 | \n", "Feature997 | \n", "Feature998 | \n", "Feature999 | \n", "Feature1000 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "-11942.239893 | \n", "12.943553 | \n", "-318.847654 | \n", "2695.776328 | \n", "-2139.814545 | \n", "-717.146778 | \n", "1539.996661 | \n", "-4706.707206 | \n", "23.007149 | \n", "1162.154929 | \n", "... | \n", "-2224.586927 | \n", "1495.867963 | \n", "-2690.879528 | \n", "37.646723 | \n", "1435.085427 | \n", "13271.111625 | \n", "659.564345 | \n", "-5542.467489 | \n", "-3158.842850 | \n", "4542.571977 | \n", "
| 1 | \n", "2439.293515 | \n", "-10658.214127 | \n", "4404.059579 | \n", "1752.411046 | \n", "-1133.866572 | \n", "-3194.139650 | \n", "4731.903754 | \n", "-5268.524990 | \n", "7.737832 | \n", "-5783.376093 | \n", "... | \n", "8555.393323 | \n", "-5691.212264 | \n", "4472.742030 | \n", "30.222789 | \n", "5634.529341 | \n", "-3354.225469 | \n", "-4704.306326 | \n", "-1173.377176 | \n", "11995.770742 | \n", "-1147.780931 | \n", "
| 2 | \n", "4031.830996 | \n", "-1820.018802 | \n", "-10320.317301 | \n", "-11446.265464 | \n", "3671.921902 | \n", "11080.449494 | \n", "9256.739749 | \n", "-5847.214536 | \n", "-2693.503273 | \n", "4293.121097 | \n", "... | \n", "-5144.520559 | \n", "3127.773453 | \n", "-122.378148 | \n", "11404.342330 | \n", "3453.546819 | \n", "1726.924090 | \n", "3215.562899 | \n", "3018.529486 | \n", "-3643.919678 | \n", "-3268.666481 | \n", "
| 3 | \n", "2237.486889 | \n", "25.665411 | \n", "-3193.970434 | \n", "6601.299042 | \n", "11282.278936 | \n", "2098.872337 | \n", "684.774436 | \n", "-2661.093197 | \n", "4053.436800 | \n", "-2418.361719 | \n", "... | \n", "-3363.379895 | \n", "-271.982509 | \n", "-2515.471402 | \n", "30.658347 | \n", "-3878.675356 | \n", "5441.705662 | \n", "-15459.669432 | \n", "-3635.072480 | \n", "-223.881934 | \n", "-8847.167633 | \n", "
| 4 | \n", "21.487903 | \n", "4117.289188 | \n", "2853.703536 | \n", "-14819.088168 | \n", "3315.687257 | \n", "1110.804968 | \n", "5206.448494 | \n", "1028.313768 | \n", "-4602.918356 | \n", "-4420.918656 | \n", "... | \n", "-3415.907445 | \n", "2830.855292 | \n", "1928.415578 | \n", "37.842845 | \n", "1249.763068 | \n", "64.976906 | \n", "-12450.703767 | \n", "-3026.076038 | \n", "-345.078986 | \n", "-9825.684247 | \n", "
5 rows × 1000 columns
\n", "| \n", " | subject_id | \n", "sex | \n", "disease | \n", "age | \n", "batch | \n", "bmi | \n", "cell_type | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "SUB_3 | \n", "0 | \n", "1 | \n", "18 | \n", "3 | \n", "34 | \n", "E | \n", "
| 1 | \n", "SUB_3 | \n", "0 | \n", "1 | \n", "18 | \n", "3 | \n", "34 | \n", "G | \n", "
| 2 | \n", "SUB_5 | \n", "1 | \n", "0 | \n", "29 | \n", "1 | \n", "32 | \n", "F | \n", "
| 3 | \n", "SUB_7 | \n", "0 | \n", "1 | \n", "21 | \n", "3 | \n", "34 | \n", "G | \n", "
| 4 | \n", "SUB_1 | \n", "0 | \n", "0 | \n", "57 | \n", "1 | \n", "16 | \n", "E | \n", "