-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathactualRunAnalysis.py
44 lines (43 loc) · 1.12 KB
/
actualRunAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# %%
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from matplotlib import pyplot as plt
# %%
"""
Read in files
"""
# One DataFrame per observation round, read from observedRound1.txt ..
# observedRound3.txt and stored in round order.
trainingData = [
    pd.read_csv(f"observedRound{roundNo}.txt") for roundNo in range(1, 4)
]
# %%
# The round-4 file is loaded separately; later cells score every model on it.
testData = pd.read_csv("observedRound4OnlyAkaTestData.txt")
# %%
"""
training model
"""
# Columns used as model inputs; the regression target is the "mean_ct" column.
featureCols = ["forward GC", "reverse GC", "Mg Conc", "dist"]

# The evaluation inputs and targets do not change between rounds, so they are
# selected once instead of on every loop iteration.
testX = testData[featureCols]
testY = testData["mean_ct"]

# score[k] is the mean squared error on testData of the model trained on
# trainingData[k].
score = []
for roundData in trainingData:
    # A fresh forest is fitted for each round's data. `reg` stays bound to the
    # model from the last round after the loop, and later cells reuse it.
    reg = RandomForestRegressor(n_estimators=3, random_state=0)
    reg.fit(roundData[featureCols], roundData["mean_ct"])
    # mean_squared_error takes (y_true, y_pred) in that order.
    score.append(mse(testY, reg.predict(testX)))
# %%
"""
plotting
"""
# x axis: row count of each training DataFrame; y axis: the matching entry
# of `score`.
roundSizes = [len(roundData) for roundData in trainingData]
plt.plot(roundSizes, score)
plt.xlabel("number of instances queried")
plt.ylabel("mean squared error")
plt.show()
# %%
"""
predict all
"""
allInst = pd.read_csv("unobserved.txt")
# Same four input columns the model was fitted on.
unobservedX = allInst[["forward GC", "reverse GC", "Mg Conc", "dist"]]
# `reg` is the regressor left bound by the training cell.
yHat = reg.predict(unobservedX)
# %%
# Row positions of allInst ordered by ascending predicted value.
yHatArgsort = np.argsort(yHat)
# The five rows with the smallest predictions; left as a bare expression so an
# interactive cell displays the resulting rows.
lowestFive = yHatArgsort[:5]
allInst.iloc[lowestFive]
# %%