-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathVisualize.py
111 lines (67 loc) · 2.7 KB
/
Visualize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import io
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
def visualize(case_df_cpm, control_df_cpm, output_path="figure/clustering.png", perplexity=30):
    """Embed case and control rows with t-SNE and save a 2-D scatter plot.

    The two frames are concatenated row-wise (case rows first), min-max
    scaled, and any NaN left after scaling is replaced with 0. The result is
    projected to two dimensions with t-SNE; every row becomes one point.
    The first ``len(case_df_cpm)`` points are drawn in red and labelled
    "Case", the remaining points in blue and labelled "Control".

    Args:
        case_df_cpm: DataFrame holding the case group, one row per plotted
            point (presumably CPM-normalised counts -- confirm with caller).
        control_df_cpm: DataFrame holding the control group, same layout.
        output_path: Where the PNG is written. Missing parent directories
            are created.
        perplexity: Requested t-SNE perplexity. scikit-learn requires the
            perplexity to be smaller than the number of rows, so it is
            lowered to ``n_rows - 1`` when fewer rows are available.

    Returns:
        None. The plot is only written to ``output_path``; no in-memory
        stream is produced.

    Raises:
        ValueError: If the two frames contain fewer than two rows in total,
            which is too few to compute an embedding.
    """
    # Combine the case and control data; case rows come first so they can be
    # recovered from the embedding by position.
    combined = pd.concat([case_df_cpm, control_df_cpm])
    n_rows = len(combined)
    if n_rows < 2:
        raise ValueError(
            f"t-SNE needs at least 2 rows in total, got {n_rows}"
        )

    # Normalize the combined data, then zero out NaNs (MinMaxScaler passes
    # missing values through untouched and t-SNE cannot handle them).
    scaled = MinMaxScaler().fit_transform(combined)
    scaled = np.nan_to_num(scaled, nan=0)
    print(scaled.shape)

    # Compute the t-SNE embedding. Clamp the perplexity so small inputs do
    # not trip scikit-learn's "perplexity must be less than n_samples" check.
    effective_perplexity = min(perplexity, n_rows - 1)
    tsne = TSNE(n_components=2, perplexity=effective_perplexity, learning_rate=200)
    embedding = tsne.fit_transform(scaled)

    case_count = len(case_df_cpm)

    # Draw on a dedicated figure and always close it, so repeated calls do
    # not pile points onto (or leak) the implicit global pyplot figure.
    fig, ax = plt.subplots()
    try:
        ax.scatter(embedding[:case_count, 0], embedding[:case_count, 1], color='red', label='Case')
        ax.scatter(embedding[case_count:, 0], embedding[case_count:, 1], color='blue', label='Control')
        ax.set_xlabel('t-SNE Dimension 1')
        ax.set_ylabel('t-SNE Dimension 2')
        ax.set_title('t-SNE Visualization')
        ax.legend()

        target = Path(output_path)
        # savefig does not create directories; make sure the target exists.
        target.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(target)
    finally:
        plt.close(fig)

    return None
# plt.savefig('visualize.png')
# # Run t-SNE
# tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
# tsne_results = tsne.fit_transform(pca_result)
# # Plot t-SNE
# plt.figure(figsize=(8,8))
# plt.scatter(tsne_results[:,0], tsne_results[:,1])
# plt.xlabel('t-SNE 1')
# plt.ylabel('t-SNE 2')
# plt.show()
def _read_sample_labels(path):
    """Return the non-blank lines of *path*, stripped of surrounding whitespace."""
    with open(path) as fin:
        # Skip blank lines (e.g. a trailing newline) so they do not turn
        # into empty-string sample names.
        return [label for label in (line.strip() for line in fin) if label]


if __name__ == '__main__':
    # Script entry point: load raw counts, filter, impute, normalize, then
    # plot the t-SNE embedding of the case and control groups.
    # Project-local pipeline steps are imported here so that importing this
    # module for `visualize` alone does not require them.
    from quality_control import filter_low_counts
    from impute import impute_missing_values
    from Normalize import normalize_rnaseq_data

    # The counts file is tab-separated despite its .csv extension.
    df = pd.read_csv('read_counts.csv', sep='\t')
    df_filtered = filter_low_counts(df)
    df_imputed = impute_missing_values(df_filtered)

    # One sample label per line in each file.
    case_samples = _read_sample_labels('case_label.txt')
    control_samples = _read_sample_labels('control_label.txt')

    df, case_df_cpm, control_df_cpm = normalize_rnaseq_data(df_imputed, case_samples, control_samples)
    print(df, case_df_cpm, control_df_cpm)

    # Keep at most the first 1000 rows of each group before embedding.
    case_df_cpm = case_df_cpm[:1000]
    control_df_cpm = control_df_cpm[:1000]
    print(df, case_df_cpm, control_df_cpm)

    visualize(case_df_cpm, control_df_cpm)