Merge pull request #53 from A-Baji/dev

A-Baji · web-flow · commit bcb93bacf2d8 · 2024-07-11T17:40:46.000-05:00
3.0.7
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 Observes [Semantic Versioning](https://semver.org/spec/v2.0.0.html) standard and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) convention.
 
+## [3.0.7] - 07-11-2024
+
+### Changed
+
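+- updated hate speech censoring to use a built-in regex filter instead of `better_profanity`
+- updated dataset file and model tagging to the form `<user>_<channel>` (was `<channel>_<user>`)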
+
 ## [3.0.6] - 07-10-2024
 
 ### Changed
diff --git a/discordai_modelizer/customize.py b/discordai_modelizer/customize.py
@@ -31,7 +31,7 @@ def create_model(
     use_existing=False,
 ):
     client = OpenAI(api_key=openai_key)
-    channel_user = f"{channel_id[:4]}_{user_id}"
+    channel_user = f"{user_id[:13]}_{channel_id[:4]}"
     files_path = pathlib.Path(appdirs.user_data_dir(appname="discordai"))
     full_logs_path = files_path / f"{channel_id}_logs.json"
     full_dataset_path = files_path / f"{channel_user}_data_set.jsonl"
@@ -112,7 +112,7 @@ def create_model(
         fine_tune = client.fine_tuning.jobs.create(
             model=MODEL_MAP[base_model],
             training_file=upload_response.id,
-            suffix=channel_user[:18],
+            suffix=channel_user,
         )
         print(
             "INFO: This may take a few minutes to hours depending on the size of the dataset and the selected base model"
diff --git a/discordai_modelizer/gen_dataset.py b/discordai_modelizer/gen_dataset.py
@@ -1,11 +1,10 @@
 from appdirs import user_data_dir
-from re import sub
 from json import load, dumps
 from datetime import timedelta
 from dateutil import parser
 from string import punctuation
 from os import path
-from better_profanity import profanity
+import re
 import pathlib
 
 
@@ -31,14 +30,31 @@ def validate_thought(thought: str) -> bool:
         if word_count >= thought_min and thought_max >= word_count:
             return True
 
-    def clean_message(msg: dict) -> dict:
+    def cleanup_string(msg: str) -> str:
         """
-        Remove URLs from a message and,
-            return the message
+        Strip URLs from a string, partially mask slurs with asterisks,
+            and return the string
         """
-        msg["content"] = profanity.censor(
-            sub(r"\bhttps?://\S+|\bftp://\S+|\bfile://\S+", "", msg["content"])
-        )
+        hate_speech_words = ["nigg", "fag", "gay", "tard"]
+
+        def censor_hate(match):
+            word = match.group()
+            # Mask each vowel, together with up to two characters after it, with asterisks
+            censored_word = re.sub(
+                r"([aeiou]).{0,2}",
+                lambda m: "*" * len(m.group()),
+                word,
+                flags=re.IGNORECASE,
+            )
+            return censored_word
+
+        url_pattern = re.compile(r"\bhttps?://\S+|\bftp://\S+|\bfile://\S+")
+        msg = url_pattern.sub("", msg)
+
+        for word in hate_speech_words:
+            pattern = re.compile(rf"(\b{re.escape(word)}\w{{0,1}})", re.IGNORECASE)
+            msg = pattern.sub(censor_hate, msg)
+
         return msg
 
     def build_thought(thought: str, msg: dict) -> str:
@@ -58,17 +74,17 @@ def build_json(thought: str) -> str:
         """
         if thought[-1] not in punctuation:
             thought += "."
-        return dumps({"prompt": f"{username} says:", "completion": thought}) + "\n"
+        return dumps({"prompt": f"{user[:13]} says:", "completion": thought}) + "\n"
 
     def add_to_dataset(thought: str):
         """
         Validate a thought, create a dataset JSON entry, and then add it to the dataset
         """
         if validate_thought(thought):
-            dataset.write(build_json(thought))
+            dataset.write(build_json(cleanup_string(thought)))
 
     files_path = pathlib.Path(user_data_dir(appname="discordai"))
-    dataset = open(files_path / f"{channel[:4]}_{user}_data_set.jsonl", "w")
+    dataset = open(files_path / f"{user[:13]}_{channel[:4]}_data_set.jsonl", "w")
     thought_max = 999999 if not thought_max else thought_max
     if "#" in user:
         username, user_id = user.split("#")
@@ -77,7 +93,7 @@ def add_to_dataset(thought: str):
     with open(file, "r", encoding="utf-8") as data_file:
         data = load(data_file)
         messages = [
-            clean_message(msg)
+            msg
             for msg in data["messages"]
             if msg["author"].get("name") == username
             and (user_id is None or msg["author"].get("discriminator") == user_id)
@@ -101,7 +117,7 @@ def add_to_dataset(thought: str):
                     thought = build_thought(thought, msg)
         add_to_dataset(thought)
     dataset.close()
-    if path.getsize(files_path / f"{channel[:4]}_{user}_data_set.jsonl") == 0:
+    if path.getsize(files_path / f"{user[:13]}_{channel[:4]}_data_set.jsonl") == 0:
         print(
             "WARNING: The resulting dataset is empty. Please double check your parameters."
         )
diff --git a/discordai_modelizer/version.py b/discordai_modelizer/version.py
@@ -1 +1 @@
-__version__ = "3.0.6"
+__version__ = "3.0.7"
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,3 @@
 openai==1.35.5
 appdirs
-python-dateutil
-better_profanity
+python-dateutil
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -9,7 +9,7 @@
 USER = os.environ["USERNAME"]
 FILES_PATH = pathlib.Path(appdirs.user_data_dir(appname="discordai"))
 FULL_LOGS_PATH = FILES_PATH / f"{CHANNEL_ID}_logs.json"
-FULL_DATASET_PATH = FILES_PATH / f"{CHANNEL_ID[:4]}_{USER}_data_set.jsonl"
+FULL_DATASET_PATH = FILES_PATH / f"{USER[:13]}_{CHANNEL_ID[:4]}_data_set.jsonl"
 
 
 def list_dict_comp(x, y):
diff --git a/tests/expected_values.py b/tests/expected_values.py
diff --git a/tests/test_dataset.py b/tests/test_dataset.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = "3.0.6"` |
|  | `1` | `+__version__ = "3.0.7"` |