22import logging
33from concurrent .futures import ThreadPoolExecutor
44from typing import Any
5+ from pydantic import BaseModel
6+ from typing import Optional
57
6- import openai
8+ from openai import OpenAI , AzureOpenAI
79
8- logger = logging . getLogger ( __name__ )
10+ from module . models import Bangumi
911
10- DEFAULT_PROMPT = """\
11- You will now play the role of a super assistant.
12- Your task is to extract structured data from unstructured text content and output it in JSON format.
13- If you are unable to extract any information, please keep all fields and leave the field empty or default value like `''`, `None`.
14- But Do not fabricate data!
15-
16- the python structured data type is:
12+ logger = logging .getLogger (__name__ )
1713
18- ```python
19- @dataclass
20- class Episode:
class Episode(BaseModel):
    """Structured fields extracted from one raw release title.

    This model is handed to the chat-completion call as ``response_format``,
    so its field names and types define the JSON the model must return.

    NOTE(review): no field has a default value. OpenAI structured outputs
    expect every property to be listed as required, so "unknown" should be
    expressed as ``None`` / ``""`` by the model rather than by omitting the
    key -- confirm before adding defaults here.
    """

    # Titles in each language; None when the raw text carries no such title.
    title_en: Optional[str]
    title_zh: Optional[str]
    title_jp: Optional[str]
    # Season number, kept as a string.
    season: str
    # presumably the season marker as written in the raw title -- TODO confirm
    season_raw: str
    # Episode number, kept as a string.
    episode: str
    # Subtitle tag, e.g. "CHT"; empty when absent.
    sub: str
    # Release group, e.g. "Lilith-Raws".
    group: str
    # Video resolution as written in the title, e.g. "1080p" or "1920X1080".
    resolution: str
    # Release source, e.g. "WEB-DL"; empty when absent.
    source: str
31- ```
32-
33- Example:
3425
35- ```
36- input: "【喵萌奶茶屋】★04月新番★[夏日重现/Summer Time Rendering][11][1080p][繁日双语][招募翻译]"
37- output: '{"group": "喵萌奶茶屋", "title_en": "Summer Time Rendering", "resolution": "1080p", "episode": 11, "season": 1, "title_zh": "夏日重现", "sub": "", "title_jp": "", "season_raw": "", "source": ""}'
3826
39- input: "【幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】"
40- output: '{"group": "幻樱字幕组", "title_en": "Komi-san wa, Komyushou Desu.", "resolution": "1920X1080", "episode": 22, "season": 2, "title_zh": "古见同学有交流障碍症", "sub": "", "title_jp": "", "season_raw": "", "source": ""}'
41-
42- input: "[Lilith-Raws] 关于我在无意间被隔壁的天使变成废柴这件事 / Otonari no Tenshi-sama - 09 [Baha][WEB-DL][1080p][AVC AAC][CHT][MP4]"
43- output: '{"group": "Lilith-Raws", "title_en": "Otonari no Tenshi-sama", "resolution": "1080p", "episode": 9, "season": 1, "source": "WEB-DL", "title_zh": "关于我在无意间被隔壁的天使变成废柴这件事", "sub": "CHT", "title_jp": ""}'
44- ```
# System-role instructions for the extraction request.
# The prompt deliberately contains no schema description or examples: the
# expected output layout is supplied separately through the request's
# ``response_format`` (the ``Episode`` model).
# NOTE(review): presumably used as the fallback when the caller passes no
# prompt of their own -- the call site is outside this view, confirm.
DEFAULT_PROMPT = """\
You will now play the role of a super assistant.
Your task is to extract structured data from unstructured text content and output it in JSON format.
If you are unable to extract any information, please keep all fields and leave the field empty or default value like `''`, `None`.
But Do not fabricate data!
"""
4633
4734
@@ -50,7 +37,8 @@ def __init__(
5037 self ,
5138 api_key : str ,
5239 api_base : str = "https://api.openai.com/v1" ,
53- model : str = "gpt-3.5-turbo" ,
40+ model : str = "gpt-4o-mini" ,
41+ api_type : str = "openai" ,
5442 ** kwargs ,
5543 ) -> None :
5644 """OpenAIParser is a class to parse text with openai
@@ -63,7 +51,7 @@ def __init__(
6351 model (str):
6452 the ChatGPT model parameter, you can get more details from \
6553 https://platform.openai.com/docs/api-reference/chat/create. \
66- Defaults to "gpt-3.5-turbo ".
54+ Defaults to "gpt-4o-mini ".
6755 kwargs (dict):
6856 the OpenAI ChatGPT parameters, you can get more details from \
6957 https://platform.openai.com/docs/api-reference/chat/create.
@@ -73,9 +61,16 @@ def __init__(
7361 """
7462 if not api_key :
7563 raise ValueError ("API key is required." )
64+ if api_type == "azure" :
65+ self .client = AzureOpenAI (
66+ api_key = api_key ,
67+ base_url = api_base ,
68+ azure_deployment = kwargs .get ("deployment_id" , "" ),
69+ api_version = kwargs .get ("api_version" , "2023-05-15" ),
70+ )
71+ else :
72+ self .client = OpenAI (api_key = api_key , base_url = api_base )
7673
77- self ._api_key = api_key
78- self .api_base = api_base
7974 self .model = model
8075 self .openai_kwargs = kwargs
8176
@@ -102,10 +97,10 @@ def parse(
10297 params = self ._prepare_params (text , prompt )
10398
10499 with ThreadPoolExecutor (max_workers = 1 ) as worker :
105- future = worker .submit (openai . ChatCompletion . create , ** params )
100+ future = worker .submit (self . client . beta . chat . completions . parse , ** params )
106101 resp = future .result ()
107102
108- result = resp [ " choices" ] [0 ][ " message" ][ "content" ]
103+ result = resp . choices [0 ]. message . parsed
109104
110105 if asdict :
111106 try :
@@ -130,12 +125,12 @@ def _prepare_params(self, text: str, prompt: str) -> dict[str, Any]:
130125 dict[str, Any]: the prepared key value pairs.
131126 """
132127 params = dict (
133- api_key = self ._api_key ,
134- api_base = self .api_base ,
128+ model = self .model ,
135129 messages = [
136130 dict (role = "system" , content = prompt ),
137131 dict (role = "user" , content = text ),
138132 ],
133+ response_format = Episode ,
139134
140135 # set temperature to 0 to make results be more stable and reproducible.
141136 temperature = 0 ,
0 commit comments