forked from JScottAndrews2/pragmatic_programming
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmessy_main.py
More file actions
80 lines (67 loc) · 3.48 KB
/
Copy pathmessy_main.py
File metadata and controls
80 lines (67 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
# NOTE: unused import
import numpy as np
# NOTE: same package called twice
from factor_analyzer import FactorAnalyzer
from factor_analyzer import ConfirmatoryFactorAnalyzer, ModelSpecificationParser
data = pd.read_csv("data/satisfaction/survey (1).csv")
# NOTE:sloppy drop of last column, which is assumed to be the dependent variable
data = data.iloc[:, 0:-1]
features = ['A1a', 'A1b', 'A2a', 'A2b', 'A2c', 'A2d', 'A2e', 'A2f', 'A2g', 'A2h', 'A3a', 'A3b', 'A3c', 'A3d',
'A3e', 'A3f', 'A3g', 'A3h', 'A3i', 'A3j', 'A3k', 'A3l', 'A3m', 'A4', 'A4ai', 'A4aii', 'A4aiii',
'A4aiv', 'A4av', 'A4avi', 'A4avii', 'B1a', 'B1b', 'B2a', 'B2b', 'B2c', 'B2d', 'B3a', 'B3b', 'B3c',
'B3d', 'B4a', 'B4b', 'B4c', 'B4d', 'B5a', 'B5b', 'B5c', 'B5d', 'B6', 'B7a', 'B7b', 'B7c', 'B7d',
'B7e', 'B8', 'B8ai', 'B8aii', 'B8aiii', 'B8aiv', 'B8av', 'B8avi', 'B8avii', 'B9', 'C1a', 'C1b', 'C1c',
'C1d', 'C1e', 'C1f', 'C1g', 'C2', 'C3', 'C4a', 'C4b', 'C4c', 'C4d', 'C4e', 'C4f', 'C4g', 'C4h', 'C4i',
'C4j', 'C4k', 'C4l', 'c4m', 'C4n', 'C4o', 'C4ii', 'C5', 'C6', 'C6ai', 'C6aii', 'C6aiii', 'C6aiv',
'C6av', 'C6avi', 'C6avii', 'C6aviii', 'C6aix', 'C6ax', 'C6axi', 'C7', 'C8', 'C9', 'D3', 'D8', 'D14']
dep_var = 'AgencySize'
na_counts = data.isna().sum()
# Keep only variables with missing counts lower than 100
keep_vars = [col for col in na_counts.index if na_counts[col] < 100]
# drop rows with missing values from the reduced variables dataset
clean_data = data[keep_vars].dropna().reset_index(drop=True)
# split data into train and test #
x = data[features]
y = data[dep_var]
# NOTE: import called within scripts instead of at the top
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30,
random_state=123)
fa = FactorAnalyzer(rotation='varimax')
# NOTE: OOOPS, forgot to pass in the training data...
fa.fit(clean_data)
# Check Eigenvalues
ev = fa.get_eigenvalues()
print(ev)
n_factors = 5
fa = FactorAnalyzer(rotation='varimax', n_factors=n_factors)
fa.fit(clean_data.values)
loadings = fa.loadings_
loadings = pd.DataFrame(loadings)
loadings.index = clean_data.columns
factors = loadings.idxmax(axis=1)
factors.columns = ['factor_number']
# NOTE: this loop can be greatly condensed using a list comprehension
scale_dict = {}
for current_factor in range(n_factors):
item_names = []
for var in factors.iteritems():
if var[1] == current_factor:
item_names.append(var[0])
scale_dict[current_factor] = item_names
# NOTE: what am I doing here? I really should leave a meaningful note!!
vars = [item for sublist in scale_dict.values() for item in sublist]
cfa_data = clean_data[vars]
model_spec = ModelSpecificationParser.parse_model_specification_from_dict(cfa_data, scale_dict)
cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
cfa.fit(cfa_data.values)
# NOTE: did I want to randomly print this? Look for unused code and remove it.
cfa.loadings_
cfa.get_standard_errors()
# OK, things are already too messy for me and I stopped!
# There would be several hundred more lines of difficult to navigate code and I don't feel like making more of that for
# no real purpose.
#Let's talk about some straegies to help clean-up this code and also make is more dynamic and re-usable.
# Think about this, if you find that you always need to split your data,
# why not make a class to handle data for every project you work on?