feat(translator): 优化 Gemini 翻译模板（吴恩达三步翻译法）并支持新模型 (#439)

* feat(translator): 优化 Gemini 翻译模板并支持新模型

  更新翻译提示模板以提供更专业的中文翻译输出，主要变更包括：

  - 添加新的 Gemini 2.0 flash 实验模型支持
  - 修改翻译提示模板，采用三步翻译流程提升翻译质量
  - 增加标签提取功能，只返回最终优化后的翻译内容
  - 移除对 {language} 参数的强制要求检查

  优化后的翻译流程包含初次翻译、反思改进和最终润色三个步骤，显著提升翻译结果的准确性和可读性。

* support md file type

---------

Co-authored-by: zhonghua.zhu <zhonghua.zhu@riveretech.com>
yihong0618 · Dec 22, 2024 · 4e7cbb5 · 4e7cbb5
1 parent b80f1ba
commit 4e7cbb5
Show file tree

Hide file tree

Showing 6 changed files with 196 additions and 5 deletions.
diff --git a/book_maker/cli.py b/book_maker/cli.py
@@ -35,8 +35,9 @@ def parse_prompt_arg(prompt_arg):
     else:
         raise FileNotFoundError(f"{prompt_arg} not found")
 
-    if prompt is None or any(c not in prompt["user"] for c in ["{text}", "{language}"]):
-        raise ValueError("prompt must contain `{text}` and `{language}`")
+    #if prompt is None or any(c not in prompt["user"] for c in ["{text}", "{language}"]):
+    if prompt is None or any(c not in prompt["user"] for c in ["{text}"]):
+        raise ValueError("prompt must contain `{text}`")
 
     if "user" not in prompt:
         raise ValueError("prompt must contain the key of `user`")

diff --git a/book_maker/loader/__init__.py b/book_maker/loader/__init__.py
@@ -1,10 +1,12 @@
 from book_maker.loader.epub_loader import EPUBBookLoader
 from book_maker.loader.txt_loader import TXTBookLoader
 from book_maker.loader.srt_loader import SRTBookLoader
+from book_maker.loader.md_loader import MarkdownBookLoader
 
 BOOK_LOADER_DICT = {
     "epub": EPUBBookLoader,
     "txt": TXTBookLoader,
     "srt": SRTBookLoader,
+    "md": MarkdownBookLoader,
     # TODO add more here
 }
diff --git a/book_maker/loader/md_loader.py b/book_maker/loader/md_loader.py
@@ -0,0 +1,284 @@
+import json
+import sys
+from pathlib import Path
+
+from book_maker.utils import prompt_config_to_kwargs
+
+from .base_loader import BaseBookLoader
+
+
+class MarkdownBookLoader(BaseBookLoader):
+    """Translate a Markdown file batch by batch into a bilingual copy.
+
+    The source is split into paragraphs, the paragraphs are grouped into
+    batches of ``batch_size`` and every batch is sent to the translator as one
+    request.  ``p_to_save`` holds one translated string per translated batch;
+    that list is what the resume file stores.
+    """
+
+    def __init__(
+        self,
+        md_name,
+        model,
+        key,
+        resume,
+        language,
+        model_api_base=None,
+        is_test=False,
+        test_num=5,
+        prompt_config=None,
+        single_translate=False,
+        context_flag=False,
+        context_paragraph_limit=0,
+        temperature=1.0,
+    ) -> None:
+        """Read ``md_name``, build the translator and split the text.
+
+        ``context_flag`` and ``context_paragraph_limit`` are accepted but not
+        used by this loader.
+
+        Raises:
+            Exception: if the Markdown file or, when ``resume`` is set, the
+                progress file cannot be read.
+        """
+        self.md_name = md_name
+        self.translate_model = model(
+            key,
+            language,
+            api_base=model_api_base,
+            temperature=temperature,
+            **prompt_config_to_kwargs(prompt_config),
+        )
+        self.is_test = is_test
+        self.p_to_save = []
+        self.bilingual_result = []
+        self.bilingual_temp_result = []
+        self.test_num = test_num
+        self.batch_size = 10
+        self.single_translate = single_translate
+        self.md_paragraphs = []
+
+        try:
+            with open(f"{md_name}", encoding="utf-8") as f:
+                self.origin_book = f.read().splitlines()
+        except Exception as e:
+            raise Exception("can not load file") from e
+
+        self.resume = resume
+        self.bin_path = f"{Path(md_name).parent}/.{Path(md_name).stem}.temp.bin"
+        if self.resume:
+            self.load_state()
+
+        self.process_markdown_content()
+
+    @staticmethod
+    def _fence_marker(stripped_line):
+        """Return the code-fence run that starts ``stripped_line``, or ``""``.
+
+        A fence is a run of at least three backticks or tildes at the start
+        of the already stripped line.  A backtick run followed by text that
+        contains another backtick is inline code, not a fence.
+        """
+        for char in ("`", "~"):
+            run_length = len(stripped_line) - len(stripped_line.lstrip(char))
+            if run_length < 3:
+                continue
+            if char == "`" and "`" in stripped_line[run_length:]:
+                return ""
+            return char * run_length
+        return ""
+
+    def process_markdown_content(self):
+        """Split ``origin_book`` into paragraphs stored in ``md_paragraphs``.
+
+        Blank lines separate paragraphs, every line starting with ``#`` is a
+        paragraph of its own, and a fenced code block is never split: blank
+        lines and ``#`` comments inside it stay part of the same paragraph.
+        """
+        paragraphs = []
+        current_paragraph = []
+        open_fence = ""  # fence run that opened the current code block
+
+        def flush():
+            if current_paragraph:
+                paragraphs.append("\n".join(current_paragraph))
+                current_paragraph.clear()
+
+        for line in self.origin_book:
+            stripped = line.strip()
+            marker = self._fence_marker(stripped)
+
+            if open_fence:
+                current_paragraph.append(line)
+                # Only a bare run of the same character, at least as long as
+                # the opening run, closes the block.
+                if (
+                    marker
+                    and marker == stripped
+                    and marker[0] == open_fence[0]
+                    and len(marker) >= len(open_fence)
+                ):
+                    open_fence = ""
+            elif marker:
+                open_fence = marker
+                current_paragraph.append(line)
+            elif not stripped:
+                flush()
+            elif stripped.startswith("#"):
+                flush()
+                paragraphs.append(line)
+            else:
+                current_paragraph.append(line)
+
+        # Keep the trailing paragraph (or an unterminated code block).
+        flush()
+        self.md_paragraphs = paragraphs
+
+    @staticmethod
+    def _is_special_text(text):
+        """Return True for text that is all digits, all whitespace or empty."""
+        return text.isdigit() or text.isspace() or len(text) == 0
+
+    def _make_new_book(self, book):
+        """No-op in this loader."""
+        pass
+
+    def _iter_batches(self):
+        """Yield the joined text of each group of ``batch_size`` paragraphs."""
+        for start in range(0, len(self.md_paragraphs), self.batch_size):
+            yield "\n\n".join(self.md_paragraphs[start : start + self.batch_size])
+
+    def _translate_batch(self, batch_text):
+        """Translate one batch, retrying up to three times on AttributeError.
+
+        Raises:
+            Exception: if translation fails; the original error is chained.
+        """
+        max_retries = 3
+        try:
+            for attempt in range(1, max_retries + 1):
+                try:
+                    return self.translate_model.translate(batch_text)
+                except AttributeError as ae:
+                    print(f"翻译出错: {ae}")
+                    if attempt == max_retries:
+                        raise Exception("翻译模型初始化失败") from ae
+        except Exception as e:
+            print(f"翻译过程中出错: {e}")
+            raise Exception("翻译过程中出现错误") from e
+
+    def make_bilingual_book(self):
+        """Translate every batch and write ``<stem>_bilingual.md``.
+
+        Batches already present in ``p_to_save`` (loaded from the resume
+        file) are reused instead of translated again.  On any error or
+        Ctrl-C the progress and a temporary book are saved and the process
+        exits with status 1.
+        """
+        batch_index = 0  # position in p_to_save of the current batch
+        resumed_batches = len(self.p_to_save)
+
+        try:
+            for batch_text in self._iter_batches():
+                if self._is_special_text(batch_text):
+                    continue
+                if batch_index < resumed_batches:
+                    temp = self.p_to_save[batch_index]
+                else:
+                    temp = self._translate_batch(batch_text)
+                    self.p_to_save.append(temp)
+
+                if not self.single_translate:
+                    self.bilingual_result.append(batch_text)
+                self.bilingual_result.append(temp)
+
+                batch_index += 1
+                if self.is_test and batch_index * self.batch_size > self.test_num:
+                    break
+
+            self.save_file(
+                f"{Path(self.md_name).parent}/{Path(self.md_name).stem}_bilingual.md",
+                self.bilingual_result,
+            )
+
+        except (KeyboardInterrupt, Exception) as e:
+            print(f"发生错误: {e}")
+            print("程序将保存进度，您可以稍后继续")
+            self._save_progress()
+            self._save_temp_book()
+            sys.exit(1)  # non-zero exit status signals the failure
+
+    def _save_temp_book(self):
+        """Write ``<stem>_bilingual_temp.txt`` from what is translated so far.
+
+        Every source batch is written; a translation follows it when one is
+        available.  Batches are walked exactly as in ``make_bilingual_book``
+        so the entries of ``p_to_save`` line up with their source text.
+        """
+        self.bilingual_temp_result = []
+        batch_index = 0
+
+        for batch_text in self._iter_batches():
+            self.bilingual_temp_result.append(batch_text)
+            if self._is_special_text(batch_text):
+                continue
+            if batch_index < len(self.p_to_save):
+                self.bilingual_temp_result.append(self.p_to_save[batch_index])
+            batch_index += 1
+
+        self.save_file(
+            f"{Path(self.md_name).parent}/{Path(self.md_name).stem}_bilingual_temp.txt",
+            self.bilingual_temp_result,
+        )
+
+    def _save_progress(self):
+        """Store ``p_to_save`` in the resume file as a JSON list.
+
+        JSON keeps each batch translation as one entry even though it spans
+        several lines.
+
+        Raises:
+            Exception: if the resume file cannot be written.
+        """
+        try:
+            with open(self.bin_path, "w", encoding="utf-8") as f:
+                json.dump(self.p_to_save, f, ensure_ascii=False)
+        except Exception as e:
+            raise Exception("can not save resume file") from e
+
+    def load_state(self):
+        """Load ``p_to_save`` from the resume file.
+
+        Files that are not a JSON list of strings are read the legacy way,
+        one entry per line.
+
+        Raises:
+            Exception: if the resume file cannot be read.
+        """
+        try:
+            with open(self.bin_path, encoding="utf-8") as f:
+                raw = f.read()
+        except Exception as e:
+            raise Exception("can not load resume file") from e
+
+        try:
+            saved = json.loads(raw)
+        except ValueError:
+            saved = None
+
+        if isinstance(saved, list) and all(isinstance(s, str) for s in saved):
+            self.p_to_save = saved
+        else:
+            self.p_to_save = raw.splitlines()
+
+    def save_file(self, book_path, content):
+        """Write ``content`` (a list of strings) to ``book_path``, one per line.
+
+        Raises:
+            Exception: if the file cannot be written.
+        """
+        try:
+            with open(book_path, "w", encoding="utf-8") as f:
+                f.write("\n".join(content))
+        except Exception as e:
+            raise Exception("can not save file") from e
diff --git a/book_maker/translator/gemini_translator.py b/book_maker/translator/gemini_translator.py
@@ -43,6 +43,7 @@
     "gemini-1.5-flash-latest",
     "gemini-1.5-flash-001",
     "gemini-1.5-flash-002",
+    "gemini-2.0-flash-exp",
 ]
 
 
@@ -75,7 +76,7 @@ def __init__(
             or environ.get(PROMPT_ENV_MAP["system"])
             or None  # Allow None, but not empty string
         )
-
+        self.interval = 3
         genai.configure(api_key=next(self.keys))
         generation_config["temperature"] = temperature
 
@@ -119,6 +120,13 @@ def translate(self, text):
                     self.prompt.format(text=text, language=self.language)
                 )
                 t_text = self.convo.last.text.strip()
+                # 检查是否包含特定标签,如果有则只返回标签内的内容
+                tag_pattern = r'<step3_refined_translation>(.*?)</step3_refined_translation>'
+                tag_match = re.search(tag_pattern, t_text, re.DOTALL)
+                if tag_match:
+                    print("[bold green]" + re.sub("\n{3,}", "\n\n", t_text) + "[/bold green]")
+                    t_text = tag_match.group(1).strip()
+                    #print("[bold green]" + re.sub("\n{3,}", "\n\n", t_text) + "[/bold green]")
                 break
             except StopCandidateException as e:
                 print(

diff --git a/prompt_md.json b/prompt_md.json
@@ -0,0 +1,4 @@
+{
+  "system": "You are a highly skilled translator responsible for translating the content of books in Markdown format from English into Chinese.",
+  "user": "## Strategies\nYou will follow a three-step translation process:\n### 1. Translate the input content from English into Chinese, respect the intention of the original text, keep the original Markdown format unchanged, and do not delete or omit any content, nor add additional explanations or remarks.\n### 2. Read the original text and the translation carefully, and then put forward constructive criticism and helpful suggestions to improve the translation. The final style and tone of the translation should conform to the Chinese language style.\nYou must strictly follow the rules below.\n- Never change the Markdown markup structure. Don't add or remove links. Do not change any URL.\n- Never touch or change the contents of code blocks even if they appear to have a bug.\n- Always preserve the original line breaks. Do not add or remove blank lines.\n- Never touch any permalink at the end of each heading.\n- Never touch HTML-like tags such as `<Notes>`.\nWhen writing suggestions, pay attention to whether there are ways to improve the translation in terms of:\n- Accuracy (by correcting errors such as additions, mistranslations, omissions or untranslated text).\n- Fluency (by applying the rules of Chinese grammar, spelling and punctuation, and ensuring there is no unnecessary repetition).\n- Conciseness and abbreviation (please appropriately simplify and abbreviate the translation result while keeping the original meaning unchanged to avoid the translation being too lengthy).\n### 3. Based on the results of steps 1 and 2, refine and polish the translation, and do not add additional explanations or remarks.\n## Output\nFor each step of the translation process, output the results within the appropriate XML tags:\n<step1_initial_translation>\n[Insert your initial translation here.]\n</step1_initial_translation>\n<step2_reflection>\n[Insert your reflection on the translation and put forward specific, useful and constructive suggestions here to improve the translation. Each suggestion should target a specific part of the translation.]\n</step2_reflection>\n<step3_refined_translation>\n[Insert your refined and polished translation here.]\n</step3_refined_translation>\n## Input\nThe following is the content of the book that needs to be translated within the <INPUT> tag:\n<INPUT>{text}</INPUT>"
+}
diff --git a/prompt_template_sample.json b/prompt_template_sample.json
@@ -1,4 +1,4 @@
 {
-  "system": "You are a professional translator.", 
-  "user": "Translate the given text to {language}. Be faithful or accurate in translation. Make the translation readable or intelligible. Be elegant or natural in translation. If the text cannot be translated, return the original text as is. Do not translate person's name. Do not add any additional text in the translation. The text to be translated is:\n{text}"
+  "system": "You are a highly skilled academic translator. Please complete the translation task according to the following instructions and provide only the final polished translation.",
+  "user": "## Strategies\nYou will follow a three-step translation process:\n### Step.1 Initial Direct Translation: Translate the content from English to Chinese sentence by sentence, respecting the original intent without deleting, omitting, or adding any extra explanations or notes.\n ### Step.2 Reflection and Revision: Carefully review both the input content and the initial direct translation from Step 1. Check if the translation conveys the original meaning, if the grammatical structure is correct, if word choices are appropriate, and if there are any ambiguities or polysemous words. The final style and tone should conform to Chinese language conventions. \nYou must strictly follow the rules below.\n- Don't add or remove links. Do not change any URL.\n- Do not translate the reference list.\n- Never touch,change or translate the mathematical formulas.\n- Never touch,change or translate the contents of code blocks even if they appear to have a bug.\n- Always preserve the original line breaks. Do not add or remove blank lines.\nProvide constructive criticism and helpful suggestions to improve: \n- translation accuracy (correct additions, mistranslations, omissions, or untranslated text errors),\n- fluency (apply Chinese grammar, spelling, and punctuation rules, and ensure no unnecessary repetition), \n- conciseness (streamline the translation results while maintaining the original meaning, avoiding wordiness).\n ### Step.3 Polish and Optimize: Based on the results from Steps 1 and 2, refine and polish the translation, ensuring the final translation adheres to Chinese style without additional explanations or notes. The content to be translated is wrapped in the following <INPUT> tags:\n\n<INPUT>{text}</INPUT>. \n\nPlease write and output only the final polished translation here: "
 }