Skip to content

Commit c922e11

Browse files
committed
feat: Added LaTeX error modification using deepseek-reasoner
1 parent f863463 commit c922e11

File tree

1 file changed

+160
-7
lines changed

1 file changed

+160
-7
lines changed

writer.py

Lines changed: 160 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
import os
88
import glob
99
import yaml
10+
import re
11+
import subprocess
12+
import tempfile
1013

1114
path_to = f'src/content/blog/{datetime.datetime.now().strftime("%Y-%m-%d")}'
1215

@@ -84,35 +87,185 @@ def outline(topic):
8487
def write_from_outline(outline):
    """Request a full Markdown article for ``outline`` from the ``deepseek-reasoner`` model.

    Two chat messages are sent through ``generate``: a system message with the
    Chinese typography rules, and a user message that embeds the outline and
    spells out the writing constraints (no images, no lists, explained code,
    LaTeX delimiter rules, heading levels).

    Args:
        outline: Outline text; interpolated verbatim at the top of the user prompt.

    Returns:
        Whatever ``generate`` returns for this conversation.
    """
    global deepseek, existing_posts_text
    typography_rules = {"role": "system", "content": "你是一位专业技术博客作者。在写作时请遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符;使用直角引号「」。"}
    writing_request = {"role": "user", "content": f"{outline}\n\n根据这个提纲中关于技术知识的部分,写出一篇技术博客文章。文章中避免出现图片,不能使用任何列表。每一段出现的代码都进行较为详细的解读。在讲述内容时尽量使用段落的语言,语言风格可以略偏专业,但保持清晰。使用Markdown(要求符合Common Markdown规范)输出,使用LaTeX公式(注意:数学的开闭定界符前后不能有字母或数字字符。像x$a + b = c$或$a + b = c$1将无法渲染为数学公式(所有$会被渲染为$);但x $\\infty$ 1和($\\infty$)会正常渲染),标题尽量只用一级标题 `#` 和二级标题 `##`,不要用分割线。请遵循中文排版规范,使用正确的标点符号。直接输出正文。"}
    conversation = [typography_rules, writing_request]
    return generate(conversation, deepseek, "deepseek-reasoner")
9093

9194
def summary(article):
    """Request a short subtitle-style blurb for ``article`` from the ``deepseek-chat`` model.

    The system message describes the blurb's role (used as a subtitle, must
    not end with a full stop) and the typography rules; the user message asks
    for a 15-character introduction and appends the article text.

    Args:
        article: Article text; interpolated verbatim at the end of the user prompt.

    Returns:
        Whatever ``generate`` returns for this conversation.
    """
    global deepseek
    blurb_rules = {"role": "system", "content": "你是一个技术博客简介写作者,简介不一定需要涵盖文章的全部内容,能起到一定的提示作用即可。直接输出简介。遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符。注意简介被作为副标题使用,不是一句句子,不要以句号结尾。"}
    blurb_request = {"role": "user", "content": f"给这篇文章写一个15字的简短介绍:\n\n{article}"}
    conversation = [blurb_rules, blurb_request]
    return generate(conversation, deepseek, "deepseek-chat")
97100

101+
# LaTeX error handling
102+
def remove_latex_comments(latex_str: str) -> str:
    r"""Strip LaTeX ``%`` comments from every line of ``latex_str``.

    A percent sign starts a comment unless it is escaped, i.e. preceded by an
    odd number of backslashes (``\%``). An even number of backslashes does not
    escape it: ``\\%`` is a line break followed by a comment.

    Args:
        latex_str: LaTeX source, possibly spanning several lines.

    Returns:
        The source with each comment (from the ``%`` to the end of its line)
        removed. Lines are re-joined with ``"\n"``, so a trailing newline in
        the input is not preserved.
    """
    # The lookbehind anchors the match at the start of the backslash run, so
    # the run that precedes ``%`` must consist of whole ``\\`` pairs only.
    comment_start = re.compile(r'(?<!\\)(?:\\\\)*%')
    cleaned_lines = []
    for line in latex_str.splitlines():
        m = comment_start.search(line)
        if m:
            # The match ends just after ``%``; keep everything before the ``%``
            # (including the backslash pairs in front of it).
            line = line[:m.end() - 1]
        cleaned_lines.append(line)
    return "\n".join(cleaned_lines)
111+
112+
def check_balanced_braces(latex_str: str) -> (bool, list):
    r"""Check that the grouping braces ``{`` / ``}`` in ``latex_str`` are balanced.

    Escaped braces (``\{`` and ``\}``) are literal characters in LaTeX, not
    group delimiters, so they are ignored. This keeps common constructs such
    as ``\left\{ ... \right.`` from being reported as unbalanced.

    Args:
        latex_str: LaTeX source to inspect.

    Returns:
        A ``(ok, errors)`` pair. ``ok`` is True when no problem was found;
        ``errors`` lists one message per unmatched brace, each carrying the
        character offset of the offending brace.
    """
    open_positions = []
    errors = []
    escaped = False  # True when the previous character was an unconsumed backslash
    for index, char in enumerate(latex_str):
        if escaped:
            # This character is the target of a backslash (``\{``, ``\}``, ``\\`` ...):
            # it can never open or close a group.
            escaped = False
            continue
        if char == '\\':
            escaped = True
        elif char == '{':
            open_positions.append(index)
        elif char == '}':
            if open_positions:
                open_positions.pop()
            else:
                errors.append(f"位置 {index}: 右大括号 '}}' 没有对应的左大括号")
    # Whatever is still on the stack was opened but never closed.
    errors.extend(
        f"位置 {pos}: 左大括号 '{{' 没有对应的右大括号" for pos in open_positions
    )
    return (len(errors) == 0), errors
127+
128+
def check_environment_matching(latex_str: str) -> (bool, list):
    r"""Verify that every ``\begin{env}`` is closed by a matching ``\end{env}``.

    The source is scanned left to right while a stack of open environments is
    maintained. An ``\end`` with nothing open, an ``\end`` whose name differs
    from the innermost open environment, and any environment still open at
    the end of the input are each reported.

    Args:
        latex_str: LaTeX source to inspect.

    Returns:
        A ``(ok, errors)`` pair: ``ok`` is True when ``errors`` is empty, and
        ``errors`` holds one message per problem, with character offsets.
    """
    problems = []
    open_envs = []
    for match in re.finditer(r'\\(begin|end)\s*{([^}]+)}', latex_str):
        kind = match.group(1)
        name = match.group(2).strip()
        offset = match.start()

        if kind == "begin":
            open_envs.append((name, offset))
            continue

        # From here on we are handling an \end{...}.
        if not open_envs:
            problems.append(f"位置 {offset}: \\end{{{name}}} 没有对应的 \\begin")
            continue

        opened_name, opened_offset = open_envs.pop()
        if opened_name != name:
            problems.append(f"位置 {opened_offset}\\begin{{{opened_name}}} 与位置 {offset}\\end{{{name}}} 不匹配")

    # Anything left on the stack was never closed.
    problems.extend(
        f"位置 {opened_offset}: \\begin{{{opened_name}}} 没有对应的 \\end"
        for opened_name, opened_offset in open_envs
    )
    return (len(problems) == 0), problems
149+
150+
def run_static_checks(latex_snippet: str) -> list:
    """Run the pure-Python LaTeX checks on ``latex_snippet``.

    Comments are stripped first, then brace balance and environment matching
    are checked on the comment-free text.

    Args:
        latex_snippet: LaTeX source of a single formula.

    Returns:
        A list of messages, brace problems first and environment problems
        second, each prefixed with its category. Empty when both checks pass.
    """
    source = remove_latex_comments(latex_snippet)
    braces_ok, brace_problems = check_balanced_braces(source)
    envs_ok, env_problems = check_environment_matching(source)

    findings = []
    if not braces_ok:
        findings += ["大括号错误: " + message for message in brace_problems]
    if not envs_ok:
        findings += ["环境匹配错误: " + message for message in env_problems]
    return findings
160+
161+
def check_with_pdflatex(latex_snippet: str) -> list:
    """Compile ``latex_snippet`` with pdflatex and collect the reported errors.

    The snippet is placed verbatim in the body of a minimal ``article``
    document (with ``amsmath`` loaded), written to a temporary directory and
    compiled in ``nonstopmode``. Every output line starting with ``!`` is
    treated as an error message.

    Args:
        latex_snippet: LaTeX source to insert into the document body.

    Returns:
        A list of error strings. It is empty when pdflatex reports no error
        and exits with status 0. If pdflatex cannot be started or exceeds the
        15 second timeout, the list holds a single message describing that
        failure instead of raising.
    """
    # Pass a one-element tuple so the ``%`` operator always sees exactly one
    # argument for the single ``%s`` placeholder.
    template = r"""
\documentclass{article}
\usepackage{amsmath}
\begin{document}
%s
\end{document}
""" % (latex_snippet,)

    errors = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        tex_file = os.path.join(tmpdirname, "temp.tex")
        with open(tex_file, "w", encoding="utf-8") as f:
            f.write(template)
        try:
            proc = subprocess.run(
                ["pdflatex", "-interaction=nonstopmode", tex_file],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=tmpdirname, timeout=15
            )
        except (OSError, subprocess.SubprocessError) as e:
            # OSError: pdflatex is missing or not executable;
            # SubprocessError: covers subprocess.TimeoutExpired.
            errors.append(f"调用 pdflatex 编译时出错: {e}")
        else:
            # The TeX log is not guaranteed to be valid UTF-8; decode leniently
            # so an odd byte in the log is not mistaken for a LaTeX error.
            output = (
                proc.stdout.decode("utf-8", errors="replace")
                + proc.stderr.decode("utf-8", errors="replace")
            )
            errors.extend(
                line.strip() for line in output.splitlines() if line.startswith("!")
            )
            if proc.returncode != 0 and not errors:
                errors.append("pdflatex 返回非 0 错误码,编译可能存在问题。")
    return errors
193+
194+
def extract_latex_segments(markdown_text: str) -> list:
    r"""Extract the LaTeX math segments from a Markdown document.

    Display math (``$$ ... $$``) is collected first, then inline math
    (``$ ... $``). Dollar signs that are not math delimiters are ignored:
    anything inside fenced code blocks or inline code spans (shell variables,
    ``$(...)`` and the like) and backslash-escaped dollars (``\$``).

    Args:
        markdown_text: Markdown source of the article.

    Returns:
        The contents of each math segment without its delimiters: all display
        segments in document order, followed by all inline segments in
        document order.
    """
    # Blank out code first so that ``$`` inside it cannot pair up as math.
    # Each code region is replaced by a space so the text around it never
    # fuses into a new delimiter pair.
    fenced_code = re.compile(
        r'^[ \t]*(`{3,}|~{3,}).*?^[ \t]*\1[ \t]*$', re.MULTILINE | re.DOTALL
    )
    inline_code = re.compile(r'(?<!`)(`+)(?!`).+?(?<!`)\1(?!`)')
    text = fenced_code.sub(" ", markdown_text)
    text = inline_code.sub(" ", text)

    segments = []
    block_pattern = re.compile(r'(?<!\\)\$\$([\s\S]+?)\$\$')
    segments.extend(block_pattern.findall(text))
    # Inline math: the opening ``$`` must not be escaped or part of ``$$``.
    # ``\\.`` consumes escaped characters (including ``\$``) as content, so
    # the closing ``$`` is always an unescaped one.
    inline_pattern = re.compile(r'(?<![\\$])\$((?:\\.|[^$\\\n])+?)\$(?!\$)')
    segments.extend(inline_pattern.findall(text))
    return segments
204+
205+
def latex_errors(markdown_text: str) -> dict:
    r"""Check every LaTeX formula found in ``markdown_text``.

    Each extracted segment is stripped, run through the static checks and
    compiled with pdflatex.

    The extracted segments no longer carry their ``$`` delimiters, so they
    are wrapped in display math (``\[ ... \]``) before compilation. Without
    this, pdflatex would process the formula in text mode and report
    "Missing $ inserted" for almost any valid formula. Segments that start
    with a stand-alone amsmath display environment (``align``, ``equation``,
    ...) are compiled as they are, because those environments must not be
    nested inside math mode.

    Args:
        markdown_text: Markdown source of the article.

    Returns:
        A dict with one entry per formula, in extraction order. Each entry
        holds the stripped formula source, the static-check errors and the
        pdflatex errors. An entry is present even when both error lists are
        empty, so callers must inspect the lists rather than the truthiness
        of the dict.
    """
    # Environments that open math mode themselves.
    standalone_env = re.compile(
        r'\\begin\s*\{(?:equation|align|alignat|flalign|gather|multline|eqnarray)\*?\}'
    )
    segments = extract_latex_segments(markdown_text)
    report = {}
    for idx, seg in enumerate(segments):
        seg = seg.strip()
        static_errors = run_static_checks(seg)
        if standalone_env.match(seg):
            compilable = seg
        else:
            # The newline before ``\]`` keeps a trailing ``%`` comment in the
            # formula from swallowing the closing delimiter.
            compilable = "\\[" + seg + "\n\\]"
        pdflatex_errors = check_with_pdflatex(compilable)
        report[f"公式段 {idx+1}"] = {
            "原始内容": seg,
            "静态检测错误": static_errors,
            "pdflatex 检测错误": pdflatex_errors
        }
    return report
218+
219+
def modify_latex(markdown_text: str, error):
    """Ask the ``deepseek-reasoner`` model to repair the LaTeX in ``markdown_text``.

    The user message wraps the article and the error report in tag-like
    markers; the system message tells the model to fix the LaTeX based on the
    report while following the typography rules.

    Args:
        markdown_text: Markdown article containing the faulty LaTeX.
        error: Error report; interpolated into the prompt via its string form.

    Returns:
        Whatever ``generate`` returns for this conversation.
    """
    global deepseek
    proofreader_rules = {"role": "system", "content": "你是LaTeX校验员。以下是一段Markdown文本,其中的LaTeX代码有错误,请基于报错修正。同时文本要遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符。直接在输出中输出文本内容。"}
    repair_request = {"role": "user", "content": f"<原文>\n{markdown_text}\n</原文>\n\n<报错>\n{error}\n</报错>"}
    conversation = [proofreader_rules, repair_request]
    return generate(conversation, deepseek, "deepseek-reasoner")
225+
226+
def is_latin(ch):
    """Return True if ``ch`` lies in U+0000-U+007F or U+00A0-U+024F."""
    return '\u0000' <= ch <= '\u007F' or '\u00A0' <= ch <= '\u024F'


def is_nonspace_latin(ch):
    """Return True for a Latin-range character that should be spaced off from CJK text.

    Whitespace and the listed markup/bracket characters are excluded, so no
    space is inserted next to them.
    """
    return is_latin(ch) and not ch.isspace() and ch not in """*()[]{}"'/-@#"""


def is_nonpunct_cjk(ch):
    """Return True if ``ch`` is outside the Latin ranges and not a listed CJK punctuation mark."""
    return not is_latin(ch) and ch not in "·!¥…()—【】、;:‘’“”,。《》?「」"


def beautify_string(text):
    """Insert a space at every boundary between CJK text and Latin text.

    A space is added between two adjacent characters when one of them is a
    non-space Latin character and the other is a non-punctuation CJK
    character, in either order. Existing spaces are left alone, so running
    the function on already spaced text changes nothing.

    Args:
        text: The string to process.

    Returns:
        The text with the extra spaces inserted.
    """
    # Collect the pieces and join once instead of growing a string with ``+=``.
    pieces = []
    previous = ""
    for current in text:
        if previous and (
            (is_nonspace_latin(current) and is_nonpunct_cjk(previous)) or
            (is_nonspace_latin(previous) and is_nonpunct_cjk(current))
        ):
            pieces.append(" ")
        pieces.append(current)
        previous = current
    return "".join(pieces)
239+
98240
# --- Generation pipeline: topic -> outline -> article -> LaTeX repair -> summary ---
# Each stage prints how long it took.

start = time.time()
print(" Generating topic:")
topic = beautify_string(extract_topic(topics_text))
print(f" Determined topic: {topic}; time spent {time.time() - start:.1f} s")

start = time.time()
print(" Generating outline:")
outline_result = beautify_string(outline(topic))
print(f" Determined outline: time spent {time.time() - start:.1f} s")

start = time.time()
print(" Generating article:")
article = write_from_outline(outline_result)
print(f" Article written: time spent {time.time() - start:.1f} s")

# Run the LaTeX checks once (each formula costs a pdflatex run) and keep only
# the formulas that actually failed. The report has an entry for every
# formula, even when its error lists are empty, so the report's own
# truthiness only says "the article contains math", not "there are errors".
latex_report = latex_errors(article)
failing_formulas = {
    name: entry
    for name, entry in latex_report.items()
    if any(value for value in entry.values() if isinstance(value, list))
}
if failing_formulas:
    print(" latex_errors exist")
    start = time.time()
    # Only the failing formulas are sent, so the model is not distracted by
    # entries that have nothing wrong with them.
    article = modify_latex(article, failing_formulas)
    print(f" LaTeX errors fixed: time spent {time.time() - start:.1f} s")

start = time.time()
article = beautify_string(article)
print(f" Article beautified: time spent {time.time() - start:.1f} s")


start = time.time()
print(" Generating summary:")
summary_result = beautify_string(summary(article))
print(f" Decided Summary: {summary_result}; time spent {time.time() - start:.1f} s")
117270

118271
lines = iter(article.split("\n"))
@@ -146,4 +299,4 @@ def summary(article):
146299
with open(f"{path_to}/index.md", "w", encoding="utf-8") as f:
147300
f.write(markdown_file)
148301

149-
print(f" Composed article: {path_to}/index.md")
302+
print(f" Composed article: {path_to}/index.md")

0 commit comments

Comments
 (0)