forked from alexmarozick/RapAnalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfixdb.py
More file actions
144 lines (129 loc) · 4.55 KB
/
Copy pathfixdb.py
File metadata and controls
144 lines (129 loc) · 4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
This is my personal Jupyter notebook, which I used for testing and curating my MongoDB database of lyrics.
Cells are separated by the `#%%` marker; I ran this notebook with VS Code's Jupyter notebook extension.
"""
#%%
import config
import json
import functools
import pymongo
from bson.objectid import ObjectId
from pprint import pprint as pp
import analyzeSong
# Connection string is looked up through the project's config module.
mongoconnect = config.get("CLIENT", "MONGODB")
# MongoClient is the client class; pymongo.mongo_client is the submodule that
# defines it and is not callable, so the previous call raised TypeError.
cluster = pymongo.MongoClient(mongoconnect)
db = cluster["Lyrics_Actual"]
# Every collection in the database is processed by the cells below.
colnames = db.list_collection_names()
#%%
# Rewrite documents that lack a top-level "song" field into the flat layout
# {"song", "lyrics", "album", "colors"}.
print(colnames)
# NOTE: the original cell also ran print(db[col].find()) at this point, before
# the loop below had bound `col`. On a fresh kernel that raises NameError (and
# otherwise reuses a stale `col` from an earlier cell), so it has been dropped.
for col in colnames:
    collection = db[col]
    docs = list(collection.find())
    # Only documents without a "song" field still need converting.
    keys = [(doc.keys(), idx) for idx, doc in enumerate(docs) if "song" not in doc]
    print(keys)
    # check if artist has proper formatting, if first entry isnt formatted, whole artist wont be
    processed = 0
    # loop through docs, reformat, replace with reformatted
    print(f"WORKING WITH {col}")
    for _key_view, idx in keys:
        doc = docs[idx]
        # The song title is the one field that is not "_id"; do not rely on
        # it sitting at a fixed position among the keys.
        song = next((field for field in doc if field != "_id"), None)
        if song is None:
            print(f"{doc['_id']} has no song field, skipping")
            continue
        # The value stored under the title is indexed as [lyrics, album, colors].
        payload = doc[song]
        lyrics, album, colors = payload[0], payload[1], payload[2]
        if lyrics is None or colors is None:
            print(f"{song} is not valid lyrics, skipping")
            continue
        newdoc = {"song": song, "lyrics": lyrics, "album": album, "colors": colors}
        # A single replace keeps the document (and its _id) intact if the
        # write fails, unlike a delete followed by a separate insert.
        collection.replace_one({"_id": doc["_id"]}, newdoc)
        processed += 1
        print(".", end="")
    print(f"processed {processed} songs ")
# %%
# Build a text index on the "song" field for the listed collections.
# Note: this rebinds the global `colnames` used by other cells.
colnames = ['jid']
for col in colnames:
    collection = db[col]
    collection.create_index(
        [('song', pymongo.TEXT)],
        name='search_index',
        default_language='english',
    )
    # The message needs the f prefix; without it the literal "{col}" was printed.
    print(f"created index for {col}")
# %%
import config
import pymongo
import analyzeSong
from bson.objectid import ObjectId
# Recompute the "colors" field of every song in every collection.
mongoconnect = config.get("CLIENT", "MONGODB")
cluster = pymongo.MongoClient(mongoconnect)
db = cluster["Lyrics_Actual"]
#MADE IT FROM NAS TO THE GAME
colnames = db.list_collection_names()
for col in colnames:
    collection = db[col]
    print(f"Fixing {col}")
    for doc in collection.find():
        try:
            print(doc['song'])
            lyrics = doc['lyrics']
            # Only the first element of the returned pair is stored.
            colors, _marked = analyzeSong.parse_and_analyze_lyrics(cmd=False, args=lyrics)
            # Target this exact document by _id; filtering on the song title
            # would update only the first match when titles are duplicated.
            collection.update_one({"_id": doc["_id"]}, {"$set": {"colors": colors}})
            print(".", end="")
        except KeyError:
            # Raised when "song" or "lyrics" is missing from the document.
            # NOTE(review): a KeyError raised inside the analyzer also lands
            # here and deletes the song - confirm that is intended.
            collection.delete_one({"_id": doc["_id"]})
            print("Invalid song deleted")
    # End the row of progress dots for this collection.
    print("")
# %%
# FIX LIST OF ARTIST
import config
import pymongo
import analyzeSong
from bson.objectid import ObjectId
# Recompute the "colors" field for the songs of a hand-picked list of artists.
mongoconnect = config.get("CLIENT", "MONGODB")
cluster = pymongo.MongoClient(mongoconnect)
db = cluster["Lyrics_Actual"]
#MADE IT FROM NAS TO THE GAME
colname = ["tyler, the creator"]
for col in colname:
    collection = db[col]
    print(f"Fixing {col}")
    for doc in collection.find():
        try:
            print(doc['song'])
            lyrics = doc['lyrics']
            # Only the first element of the returned pair is stored.
            colors, _marked = analyzeSong.parse_and_analyze_lyrics(cmd=False, args=lyrics)
            # Target this exact document by _id; filtering on the song title
            # would update only the first match when titles are duplicated.
            collection.update_one({"_id": doc["_id"]}, {"$set": {"colors": colors}})
            print(".", end="")
        except KeyError:
            # Raised when "song" or "lyrics" is missing from the document.
            # NOTE(review): a KeyError raised inside the analyzer also lands
            # here and deletes the song - confirm that is intended.
            collection.delete_one({"_id": doc["_id"]})
            print("Invalid song deleted")
    # End the row of progress dots for this collection.
    print("")
# %%
import config
import pymongo
import analyzeSong
from bson.objectid import ObjectId
# Delete every song whose "colors" field is an empty list.
mongoconnect = config.get("CLIENT", "MONGODB")
cluster = pymongo.MongoClient(mongoconnect)
db = cluster["Lyrics_Actual"]
colnames = db.list_collection_names()
print(len(colnames))
#MADE IT FROM NAS TO THE GAME
# colname = ["tyler, the creator"]
for col in colnames:
    print(f"FIXING {col}")
    collection = db[col]
    # Collection.count() was deprecated and later removed from PyMongo;
    # count_documents({}) is the supported way to count a whole collection.
    print(collection.count_documents({}))
    for doc in collection.find():
        # .get() keeps a document without a "colors" field from aborting the
        # sweep; only an explicitly empty list triggers deletion.
        if doc.get('colors') == []:
            collection.delete_one({"_id": doc["_id"]})
            print(f"Deleted: {doc.get('song')} id {doc['_id']}, had no colors")
        # try:
        #     if len(str(doc['colors'][0][0])) > 4:
        #         collection.delete_one({"_id" : ObjectId(doc['_id'])})
        #         print(f"Deleted: {doc['song']} id {doc['_id']}")
        # except:
        #     print(f"{doc['song']} invalid colors {doc['colors']}")
        #     # collection.delete_one({"_id" : ObjectId(doc['_id'])})
    print(80 * "-")
# %%