RapAnalysis/analyzeSong.py at main · alexmarozick/RapAnalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
"""
Juno Mayer
A test of kevin brown's rhyme detection
takes lyrics, tokenizes them, prints them out to console with marks to deliminate rhymes

"""
import os
import sys
from pprint import pprint as pp
from copy import deepcopy
import nltk
import logging
import json
import random


# Configure the root logger once at import time; DEBUG messages emitted
# throughout this module stay hidden unless the level is lowered.
logging.basicConfig(level=logging.INFO)

# TODO: Look at matching by syllable first; where that is not possible,
# fall back to comparing the last TWO phonemes to decide on a rhyme.

# Word -> list of pronunciations (each pronunciation is a list of phoneme
# strings), built from NLTK's CMUdict corpus when the module is imported.
# NOTE(review): presumably requires the "cmudict" corpus to be downloaded
# beforehand - confirm against the project's setup instructions.
phone_dictionary = nltk.corpus.cmudict.dict()

def possible_phones(word):
    """
    Look up `word` in the pronunciation dictionary.

    Returns the stored list of pronunciations for the word, or an empty
    list when the word has no entry.
    """
    return phone_dictionary.get(word, [])


def phon_match(first_phon: list, second_phon: list, start: int, end: int, debug=False) -> float:
    """
    Score how well the tails of two pronunciations agree.

    Both phoneme lists are sliced with [start:end], reversed, and then
    compared position by position (last phoneme first) over the length of
    the shorter slice.

    first_phon, second_phon: lists of phoneme strings
    start, end: slice bounds applied to both lists (None is allowed)
    debug: when True, additionally log the reversed ranges at DEBUG level
    returns:
    hits / total as a float between 0 and 1, where 1.0 means every compared
    phoneme matched. Returns 0.0 when either slice is empty, since there is
    nothing to compare.
    """
    logging.debug(f"FIRST: {first_phon} SECOND: {second_phon}")

    # Reverse the slices so index 0 is the final phoneme of each word.
    first_range = first_phon[start:end][::-1]
    second_range = second_phon[start:end][::-1]

    if debug:
        logging.debug(f"FIRST RANGE: {first_range}, SECOND RANGE: {second_range}")

    # Only the overlap of the two ranges is compared.
    total = min(len(first_range), len(second_range))
    if total == 0:
        # Guard against dividing by zero when a slice selects no phonemes.
        return 0.0

    hits = 0
    for phone, other_phone in zip(first_range, second_range):
        if phone == other_phone:
            hits += 1

            # Phones with emphasis (trailing stress digit) are better
            # matches: count them twice in both numerator and denominator.
            if phone[-1].isdigit():
                hits += 1
                total += 1

    return hits / total

def word_similarity(first_word, second_word, start_phone=None, end_phone=None, debug=False):
    """
    Rate how closely two words rhyme using their dictionary pronunciations.

    Each word has the characters ".!?-()" stripped from both ends and "in'"
    replaced by "ing" before it is looked up with possible_phones().

    first_word, second_word: the words to compare
    start_phone, end_phone: slice bounds forwarded to phon_match()
    debug: forwarded to phon_match() to enable its extra range logging
    returns:
    0 when either word has no known pronunciation;
    the raw phon_match() score when each word has exactly one pronunciation;
    otherwise 1 if any pairing of pronunciations scores exactly 1, else 0.
    """
    first_word = first_word.strip(".!?-()").replace("in'", "ing")
    second_word = second_word.strip(".!?-()").replace("in'", "ing")

    first_phones = possible_phones(first_word)
    second_phones = possible_phones(second_word)
    logging.debug(f"SIMILARITY BETWEEN: {first_word} {first_phones} {second_word} {second_phones}")

    if not first_phones or not second_phones:
        return 0

    # Exactly one pronunciation on each side: report the score itself.
    if len(first_phones) == 1 and len(second_phones) == 1:
        return phon_match(first_phones[0], second_phones[0], start_phone, end_phone, debug=debug)

    # Multiple pronunciations for one or both words: it is a rhyme as soon
    # as any combination is a perfect match, so stop at the first one.
    for f_p in first_phones:
        for s_p in second_phones:
            if phon_match(f_p, s_p, start_phone, end_phone, debug=debug) == 1:
                return 1
    return 0

def mark_with_rhymes(lyrics: list) -> tuple:
    """
    Tag the words of one section with rhyme-group numbers.

    Every word (the "rhymer") is compared with each later word (the
    "rhymee") that has not been wrapped yet. When both words are in the
    pronunciation dictionary they rhyme if word_similarity() returns 1;
    otherwise they only "rhyme" when the two strings are identical.
    Example: in "dead in the middle of little italy", middle is the rhymer
    and little the rhymee.

    lyrics: list of word strings for one section (not modified)
    returns: (colorlist, mark_copy)
    colorlist: one int per word. -1 for words never matched, otherwise the
        rhyme-group number; a group that ends up with a single member is
        reset to 0.
    mark_copy: copy of lyrics in which matched words are wrapped in their
        group number, with "in'" rewritten to "ING" (e.g. "3cat3").

    NOTE(review): group numbering starts at 0, so 0 identifies the first
    rhyme group and is also the "no highlight" value given to singleton
    groups - confirm how consumers of colorlist interpret 0.
    """
    logging.debug(lyrics)
    mark_copy = deepcopy(lyrics)
    colorlist = [-1] * len(mark_copy)

    # Entries of mark_copy that are still original words (i.e. not wrapped
    # yet) are recognised by membership in the original word list; a set
    # makes that check constant-time.
    original_words = set(lyrics)
    found_rhymes = 0

    for idx, rhymer in enumerate(lyrics):
        if '[' in rhymer or ']' in rhymer:
            # Bracketed tokens look like section markers that survived parsing.
            logging.info(f"bracketed token in lyrics: {rhymer}")

        # The rhymer's pronunciations do not change while scanning rhymees.
        rhymer_phon = possible_phones(rhymer.strip(".!?-()").replace("in'", "ing"))
        found = False

        # The slice is a snapshot: wrapping words below does not alter the
        # sequence being iterated.
        for rhymee in mark_copy[idx + 1:]:
            if rhymee not in original_words:
                # Already wrapped by an earlier rhyme group.
                continue

            rhymee_phon = possible_phones(rhymee.strip(".!?-()").replace("in'", "ing"))

            # Criteria for rhyming depend on whether both words are in the
            # phoneme dictionary; if one is not, the best we can do is
            # check for exact equality (for now).
            if not rhymer_phon or not rhymee_phon:
                they_rhyme = rhymer == rhymee
                logging.debug(f"{rhymer} or {rhymee} was not found in CMUDict")
            else:
                # Shortest pronunciation across both words.
                shortest_rhymer_phon = min(rhymer_phon, key=len)
                shortest_rhymee_phon = min(rhymee_phon, key=len)
                shortest_phon = min([shortest_rhymee_phon, shortest_rhymer_phon], key=len)

                logging.debug(f"RHYMER: {rhymer} {rhymer_phon}, RHYMEE: {rhymee} {rhymee_phon}, SHORTEST: {shortest_phon}")
                logging.debug(f"{len(shortest_phon)} phonemes in {shortest_phon}")

                # Start two phonemes before the end of the shortest
                # pronunciation; for a one-phoneme word this is -1, which
                # selects just its last phoneme.
                start_phone = len(shortest_phon) - 2
                they_rhyme = word_similarity(rhymer, rhymee, start_phone=start_phone, debug=True) == 1

            if not they_rhyme:
                continue

            found = True
            # Mark both words of the rhyme pair with the same number.
            delimnated_rhymer = str(found_rhymes) + rhymer.replace("in'", "ING") + str(found_rhymes)
            delimnated_rhymee = str(found_rhymes) + rhymee.replace("in'", "ING") + str(found_rhymes)

            # Replace ALL remaining instances of the rhymee with its
            # delimited version.
            rhymee_indicies = [i for i, x in enumerate(mark_copy) if x == rhymee]
            logging.debug(f"matching {rhymer}, {rhymee} is in {rhymee_indicies}")
            for i in rhymee_indicies:
                mark_copy[i] = delimnated_rhymee
                colorlist[i] = found_rhymes

            if mark_copy[idx] == rhymer:
                mark_copy[idx] = delimnated_rhymer
                colorlist[idx] = found_rhymes
            else:
                # The rhymer's slot no longer holds the plain word, so it
                # has already been marked.
                logging.debug(f"{rhymer, idx} alerady marked by another rhyme, rhymee: {rhymee}")
                logging.debug(f"{rhymee_indicies}   indicies")

            logging.debug(f"word {delimnated_rhymer} rhymes with {delimnated_rhymee}")

        if found:
            # This rhymer produced at least one rhyme; move to the next group number.
            found_rhymes += 1

    # Retroactively clear groups where only one word carries the number,
    # i.e. only one instance of that num in colorlist.
    for num in range(found_rhymes):
        indicies_of_rhymenum = [i for i, x in enumerate(colorlist) if num == x]
        if len(indicies_of_rhymenum) == 1:
            # marking as 0 means no highlight
            colorlist[indicies_of_rhymenum[0]] = 0
    return colorlist, mark_copy


def parse_lyrics(lyrics) -> list:
    '''
    Turn a raw lyrics string into a list of word lists, one per section.

    Genius separates song sections (Verse, Chorus, Intro, Outro, etc) with a
    blank line and starts each one with a header in square brackets:
    [Verse 1]
    ...
    ...

    [Chorus]
    .......

    The text is lower-cased and cut after every blank line. Within each
    section the first line (the header) is dropped, the remaining lines are
    split on whitespace, and the characters ,?".()— are stripped from both
    ends of every word.

    If the text contains no blank line at all, "Invalid Lyrics" is printed
    and the pair (None, None) is returned instead of a list.
    '''
    lines = lyrics.lower().split('\n')

    # Walk the lines once, closing a section right after each blank line.
    sections = []
    current = []
    for line in lines:
        current.append(line)
        if line == '':
            sections.append(current)
            current = []

    # No section was ever closed, so there was no blank line in the input.
    if not sections:
        print("Invalid Lyrics")
        return None, None

    # Whatever follows the last blank line forms the final section.
    if current:
        sections.append(current)

    # Skip each section's first line and clean up the individual words.
    return [
        [word.strip(",?\".()—") for line in section[1:] for word in line.split()]
        for section in sections
    ]

def analyze_lyrics(lyrics: list,showResult=False) -> list:
    """
    Run rhyme detection over every section of a parsed song.

    lyrics: [[list of words in section] for each section]
    showResult: when set, each section's marked words are also logged
        (at DEBUG level); used for the -t flag
    returns a pair (rhyme numbers, marked lyrics):
    rhyme numbers: [[list of rhyme numbers] for each analyzed section]
    marked lyrics: [[list of marked words] for each analyzed section],
        where rhyming words are wrapped in their rhyme number
    Sections that are None or an empty list are skipped and contribute
    nothing to either result.
    """
    numbers_by_section = []
    marked_by_section = []

    for section in lyrics:
        # Nothing to analyze for blank sections.
        if section == [] or section is None:
            logging.debug("Found an empty section")
            continue

        rhyme_numbers, marked = mark_with_rhymes(section)
        logging.debug(f"RhymeNumbers: {len(rhyme_numbers)}MARKED {len(section)} ")
        numbers_by_section.append(rhyme_numbers)
        marked_by_section.append(marked)
        if showResult:
            logging.debug(marked)

    logging.debug(numbers_by_section)
    return numbers_by_section, marked_by_section

def parse_and_analyze_lyrics(lyrics=None,cmd=False,args=None,genius=True):
    """
    Parses and analyzes song lyrics for rhymes
    Command Line (cmd=True, args is an argv-style list [prog, flag, filename]):
        -t Accepts a text file with the lyrics of a single song, copied from Genius.com
        (Used for Debugging and Development)
        Writes "<filename minus its last 4 characters>_analyzed.json" and
        returns (rhyme numbers, marked lyrics).
        -j Accepts Artist JSON Object created by the LyricsGenius library, obtained by
        using buildDB.py
        Use to build a local database of song lyrics for personal data analysis or project contribution
        Writes "<filename>_reanalyzed.json" and returns the updated database list.
        Any other flag, or too few arguments, prints the usage text and returns -1.

    Function Call (cmd=False):
        Accepts a lyrics string using the 'lyrics' argument, returns rhyme numbers and marked lyrics
        Used by buildDB.py to create our MongoDB database

    The 'genius' argument is accepted but not used.
    """
    if not cmd:
        songlyrics = parse_lyrics(lyrics)
        rhyme_num, marked = analyze_lyrics(songlyrics)
        return rhyme_num, marked

    usage = (
        "USAGE: python3 analyzeSong.py [-t][-j] FILENAME \n\n "
        "    -t : .txt file of lyrics copied from Genius.com \n "
        "    OR \n "
        "    -j : .json generated from BuildDB.py"
    )

    # This function was run from the command line; both a flag and a
    # filename are needed before anything can be opened.
    if args is None or len(args) < 3:
        print(usage)
        return -1

    flag, filename = args[1], args[2]

    if flag == '-j':
        # -j: json generated from lyricsGenius
        with open(filename, 'r') as fp:
            jp = json.load(fp)
        db = jp['Database']
        for idx, entry in enumerate(db):
            # Each entry maps artist name -> list of song dicts; rebuild it
            # with every song re-analyzed.
            reanalyzed = {}
            for artist, songs in entry.items():
                print(f"Loaded {len(songs)} songs by {artist}")
                print('Parsing...')
                newsongs = []
                for song in songs:
                    parsed = parse_lyrics(song['lyrics'])
                    print(f"Analyzing {song['song']}")
                    rhyme_num_list, marked_lyrics = analyze_lyrics(parsed)
                    newsongs.append({
                        "song" : song['song'],
                        "album" : song['album'],
                        "lyrics" : song['lyrics'],
                        "rhyme" : rhyme_num_list,
                        "marked" : marked_lyrics
                        })
                reanalyzed[artist] = newsongs
            db[idx] = reanalyzed
        with open(f"{filename}_reanalyzed.json",'w') as fp:
            json.dump({"Database": db},fp)
        return db

    if flag == '-t':
        # -t: .txt of lyrics copied from Genius.com
        with open(filename, 'r') as fp:
            lyrics = fp.read()
        print("Loaded Lyrics from Text File")
        parsed = parse_lyrics(lyrics)
        rhyme_num_list, marked_lyrics = analyze_lyrics(parsed,showResult=True)
        with open(f'{filename[:-4]}_analyzed.json','w') as fp:
            json.dump(
                {"filename" : filename,
                "lyrics": lyrics,
                "marked" : marked_lyrics,
                "rhyme" : rhyme_num_list}, fp)
        print(marked_lyrics)
        return rhyme_num_list, marked_lyrics

    print(usage)
    return -1


# Script entry point: hand the raw command-line arguments to the analyzer
# in command-line mode (see the -t / -j flags in its docstring).
if __name__ == "__main__":
    parse_and_analyze_lyrics(cmd=True,args=sys.argv)