Batch effect visualization

import pandas as pd
from hest.batch_effect import filter_hest_stromal_housekeeping, get_silhouette_score, plot_umap

meta_df = pd.read_csv('../hest_data/HEST_v1_2_0.csv')

meta_df = meta_df[
    (meta_df['st_technology'] == 'Visium') & 
    (meta_df['disease_state'] == 'Cancer') & 
    (meta_df['oncotree_code'] == 'IDC') & 
    (meta_df['species'] == 'Homo sapiens')
]

# We filter spots, such that:
# - we only keep the most stable housekeeping genes (based on https://housekeeping.unicamp.br/?download)
# - we only keep the spots under the stroma (based on CellViT segmentation)
adata_list = filter_hest_stromal_housekeeping(meta_df, hest_dir='../hest_data')

labels = meta_df['id'].values
score = get_silhouette_score(adata_list, labels=labels)
# (-1: strong overlap between clusters, can be indicative of a small batch effect, 1: poor overlap between clusters, can be indicative of a strong batch effect)
print(f'Silhouette score is {score}')


plot_umap(adata_list, labels, 'batch_whole_tissue.png', umap_kwargs={'min_dist': 0.6})
0it [00:00, ?it/s]
18:02:41 | DEBUG | Filtering TENX68
100%|██████████| 1/1 [00:03<00:00,  3.58s/it]
1it [00:05,  5.55s/it]
18:02:46 | DEBUG | Filtering TENX53
100%|██████████| 1/1 [00:00<00:00,  1.89it/s]
2it [00:18,  9.89s/it]
18:02:59 | DEBUG | Filtering TENX39
100%|██████████| 1/1 [00:00<00:00,  3.39it/s]
3it [00:21,  6.62s/it]
18:03:02 | DEBUG | Filtering TENX14
100%|██████████| 1/1 [00:00<00:00,  1.86it/s]
4it [00:27,  6.29s/it]
18:03:08 | DEBUG | Filtering TENX13
100%|██████████| 1/1 [00:00<00:00,  1.18it/s]
5it [00:31,  5.50s/it]
18:03:12 | DEBUG | Filtering NCBI776
100%|██████████| 1/1 [00:00<00:00,  1.40it/s]
6it [00:35,  5.27s/it]
18:03:17 | DEBUG | Filtering NCBI684
100%|██████████| 1/1 [00:00<00:00,  1.60it/s]
7it [00:43,  6.05s/it]
18:03:24 | DEBUG | Filtering NCBI683
100%|██████████| 1/1 [00:00<00:00,  1.34it/s]
8it [00:51,  6.57s/it]
18:03:32 | DEBUG | Filtering NCBI682
100%|██████████| 1/1 [00:00<00:00,  1.08it/s]
9it [01:00,  7.34s/it]
18:03:41 | DEBUG | Filtering NCBI681
100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
10it [01:09,  6.93s/it]
/home/fedshyvana/anaconda3/envs/cuml/lib/python3.9/site-packages/anndata/_core/anndata.py:1818: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
  utils.warn_names_duplicates("obs")
Silhouette score is -0.13047750294208527
/home/fedshyvana/anaconda3/envs/cuml/lib/python3.9/site-packages/umap/__init__.py:9: ImportWarning: Tensorflow not installed; ParametricUMAP will be unavailable
  warn(
/home/fedshyvana/anaconda3/envs/cuml/lib/python3.9/site-packages/anndata/_core/anndata.py:1818: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
  utils.warn_names_duplicates("obs")
/home/fedshyvana/anaconda3/envs/cuml/lib/python3.9/site-packages/umap/umap_.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
../_images/b50fed82059daef4edcb495871b3b4668c35f151f29fc5fb31bdf2901f21d1ab.png