Skip to content

Commit bcb93ba

Browse files
authored
Merge pull request #53 from A-Baji/dev
3.0.7
2 parents 0660a8b + acfc110 commit bcb93ba

File tree

8 files changed

+329
-235
lines changed

8 files changed

+329
-235
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,13 @@
22

33
Observes [Semantic Versioning](https://semver.org/spec/v2.0.0.html) standard and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) convention.
44

5+
## [3.0.7] - 07-11-2024
6+
7+
### Changed
8+
9+
- updated hate speech censoring
10+
- updated file model tagging
11+
512
## [3.0.6] - 07-10-2024
613

714
### Changed

discordai_modelizer/customize.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def create_model(
3131
use_existing=False,
3232
):
3333
client = OpenAI(api_key=openai_key)
34-
channel_user = f"{channel_id[:4]}_{user_id}"
34+
channel_user = f"{user_id[:13]}_{channel_id[:4]}"
3535
files_path = pathlib.Path(appdirs.user_data_dir(appname="discordai"))
3636
full_logs_path = files_path / f"{channel_id}_logs.json"
3737
full_dataset_path = files_path / f"{channel_user}_data_set.jsonl"
@@ -112,7 +112,7 @@ def create_model(
112112
fine_tune = client.fine_tuning.jobs.create(
113113
model=MODEL_MAP[base_model],
114114
training_file=upload_response.id,
115-
suffix=channel_user[:18],
115+
suffix=channel_user,
116116
)
117117
print(
118118
"INFO: This may take a few minutes to hours depending on the size of the dataset and the selected base model"

discordai_modelizer/gen_dataset.py

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
from appdirs import user_data_dir
2-
from re import sub
32
from json import load, dumps
43
from datetime import timedelta
54
from dateutil import parser
65
from string import punctuation
76
from os import path
8-
from better_profanity import profanity
7+
import re
98
import pathlib
109

1110

@@ -31,14 +30,31 @@ def validate_thought(thought: str) -> bool:
3130
if word_count >= thought_min and thought_max >= word_count:
3231
return True
3332

34-
def clean_message(msg: dict) -> dict:
33+
def cleanup_string(msg: str) -> str:
3534
"""
36-
Remove URLs from a message and,
37-
return the message
35+
Remove URLs and slurs from a string and,
36+
return the string
3837
"""
39-
msg["content"] = profanity.censor(
40-
sub(r"\bhttps?://\S+|\bftp://\S+|\bfile://\S+", "", msg["content"])
41-
)
38+
hate_speech_words = ["nigg", "fag", "gay", "tard"]
39+
40+
def censor_hate(match):
41+
word = match.group()
42+
# Find all vowels and replace each along with up to the next two characters
43+
censored_word = re.sub(
44+
r"([aeiou]).{0,2}",
45+
lambda m: "*" * len(m.group()),
46+
word,
47+
flags=re.IGNORECASE,
48+
)
49+
return censored_word
50+
51+
url_pattern = re.compile(r"\bhttps?://\S+|\bftp://\S+|\bfile://\S+")
52+
msg = url_pattern.sub("", msg)
53+
54+
for word in hate_speech_words:
55+
pattern = re.compile(rf"(\b{re.escape(word)}\w{{0,1}})", re.IGNORECASE)
56+
msg = pattern.sub(censor_hate, msg)
57+
4258
return msg
4359

4460
def build_thought(thought: str, msg: dict) -> str:
@@ -58,17 +74,17 @@ def build_json(thought: str) -> str:
5874
"""
5975
if thought[-1] not in punctuation:
6076
thought += "."
61-
return dumps({"prompt": f"{username} says:", "completion": thought}) + "\n"
77+
return dumps({"prompt": f"{user[:13]} says:", "completion": thought}) + "\n"
6278

6379
def add_to_dataset(thought: str):
6480
"""
6581
Validate a thought, create a dataset JSON entry, and then add it to the dataset
6682
"""
6783
if validate_thought(thought):
68-
dataset.write(build_json(thought))
84+
dataset.write(build_json(cleanup_string(thought)))
6985

7086
files_path = pathlib.Path(user_data_dir(appname="discordai"))
71-
dataset = open(files_path / f"{channel[:4]}_{user}_data_set.jsonl", "w")
87+
dataset = open(files_path / f"{user[:13]}_{channel[:4]}_data_set.jsonl", "w")
7288
thought_max = 999999 if not thought_max else thought_max
7389
if "#" in user:
7490
username, user_id = user.split("#")
@@ -77,7 +93,7 @@ def add_to_dataset(thought: str):
7793
with open(file, "r", encoding="utf-8") as data_file:
7894
data = load(data_file)
7995
messages = [
80-
clean_message(msg)
96+
msg
8197
for msg in data["messages"]
8298
if msg["author"].get("name") == username
8399
and (user_id is None or msg["author"].get("discriminator") == user_id)
@@ -101,7 +117,7 @@ def add_to_dataset(thought: str):
101117
thought = build_thought(thought, msg)
102118
add_to_dataset(thought)
103119
dataset.close()
104-
if path.getsize(files_path / f"{channel[:4]}_{user}_data_set.jsonl") == 0:
120+
if path.getsize(files_path / f"{user[:13]}_{channel[:4]}_data_set.jsonl") == 0:
105121
print(
106122
"WARNING: The resulting dataset is empty. Please double check your parameters."
107123
)

discordai_modelizer/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "3.0.6"
1+
__version__ = "3.0.7"

requirements.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
openai==1.35.5
22
appdirs
3-
python-dateutil
4-
better_profanity
3+
python-dateutil

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
USER = os.environ["USERNAME"]
1010
FILES_PATH = pathlib.Path(appdirs.user_data_dir(appname="discordai"))
1111
FULL_LOGS_PATH = FILES_PATH / f"{CHANNEL_ID}_logs.json"
12-
FULL_DATASET_PATH = FILES_PATH / f"{CHANNEL_ID[:4]}_{USER}_data_set.jsonl"
12+
FULL_DATASET_PATH = FILES_PATH / f"{USER[:13]}_{CHANNEL_ID[:4]}_data_set.jsonl"
1313

1414

1515
def list_dict_comp(x, y):

0 commit comments

Comments (0)