11from appdirs import user_data_dir
2- from re import sub
32from json import load , dumps
43from datetime import timedelta
54from dateutil import parser
65from string import punctuation
76from os import path
8- from better_profanity import profanity
7+ import re
98import pathlib
109
1110
@@ -31,14 +30,31 @@ def validate_thought(thought: str) -> bool:
3130 if word_count >= thought_min and thought_max >= word_count :
3231 return True
3332
def cleanup_string(msg: str) -> str:
    """
    Strip URLs from a string and mask hate-speech words in it.

    URLs using the http, https, ftp or file schemes are deleted outright.
    Any word that starts with one of the listed stems is then masked: within
    the stem plus at most one following word character, every vowel and up
    to two characters after it are replaced by "*". The sanitized string is
    returned; the input is not modified.
    """
    hate_speech_words = ["nigg", "fag", "gay", "tard"]
    # A vowel together with up to two characters that follow it.
    vowel_run = re.compile(r"[aeiou].{0,2}", re.IGNORECASE)

    def censor_hate(match):
        # Replace each vowel run with "*" of equal length, so the masked
        # text keeps the length of the matched text.
        return vowel_run.sub(lambda m: "*" * len(m.group()), match.group())

    msg = re.sub(r"\bhttps?://\S+|\bftp://\S+|\bfile://\S+", "", msg)

    # Stems are applied one after another, in list order, so a later stem
    # also sees text that an earlier stem has already masked.
    for word in hate_speech_words:
        pattern = re.compile(rf"\b{re.escape(word)}\w?", re.IGNORECASE)
        msg = pattern.sub(censor_hate, msg)

    return msg
4359
4460 def build_thought (thought : str , msg : dict ) -> str :
@@ -58,17 +74,17 @@ def build_json(thought: str) -> str:
5874 """
5975 if thought [- 1 ] not in punctuation :
6076 thought += "."
61- return dumps ({"prompt" : f"{ username } says:" , "completion" : thought }) + "\n "
77+ return dumps ({"prompt" : f"{ user [: 13 ] } says:" , "completion" : thought }) + "\n "
6278
def add_to_dataset(thought: str):
    """
    Clean a thought, validate it, and then add its JSON entry to the dataset

    The thought is passed through cleanup_string first, and the cleaned text
    is what gets validated and written. Nothing is written when validation
    fails.
    """
    cleaned = cleanup_string(thought)
    # Validate the text that will actually be stored: stripping URLs can
    # shorten a thought, or leave nothing at all, and build_json indexes the
    # last character of whatever it is given.
    if validate_thought(cleaned):
        dataset.write(build_json(cleaned))
6985
7086 files_path = pathlib .Path (user_data_dir (appname = "discordai" ))
71- dataset = open (files_path / f"{ channel [: 4 ]} _{ user } _data_set.jsonl" , "w" )
87+ dataset = open (files_path / f"{ user [: 13 ]} _{ channel [: 4 ] } _data_set.jsonl" , "w" )
7288 thought_max = 999999 if not thought_max else thought_max
7389 if "#" in user :
7490 username , user_id = user .split ("#" )
@@ -77,7 +93,7 @@ def add_to_dataset(thought: str):
7793 with open (file , "r" , encoding = "utf-8" ) as data_file :
7894 data = load (data_file )
7995 messages = [
80- clean_message ( msg )
96+ msg
8197 for msg in data ["messages" ]
8298 if msg ["author" ].get ("name" ) == username
8399 and (user_id is None or msg ["author" ].get ("discriminator" ) == user_id )
@@ -101,7 +117,7 @@ def add_to_dataset(thought: str):
101117 thought = build_thought (thought , msg )
102118 add_to_dataset (thought )
103119 dataset .close ()
104- if path .getsize (files_path / f"{ channel [: 4 ]} _{ user } _data_set.jsonl" ) == 0 :
120+ if path .getsize (files_path / f"{ user [: 13 ]} _{ channel [: 4 ] } _data_set.jsonl" ) == 0 :
105121 print (
106122 "WARNING: The resulting dataset is empty. Please double check your parameters."
107123 )
0 commit comments