From 77bab7bf4c24b783da508cb582f02a5cbd81b8b9 Mon Sep 17 00:00:00 2001 From: zengzh Date: Thu, 2 Mar 2023 15:24:11 +0800 Subject: [PATCH 1/4] add batch func --- make.py | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/make.py b/make.py index c936dc82..db3fb532 100644 --- a/make.py +++ b/make.py @@ -9,6 +9,7 @@ from bs4 import BeautifulSoup as bs from ebooklib import epub from rich import print +import multiprocessing as mp NO_LIMIT = False IS_TEST = False @@ -116,7 +117,6 @@ def translate(self, text): class BEPUB: def __init__(self, epub_name, model, key): self.epub_name = epub_name - self.new_epub = epub.EpubBook() self.translate_model = model(key) self.origin_book = epub.read_epub(self.epub_name) @@ -125,29 +125,41 @@ def make_bilingual_book(self): new_book.metadata = self.origin_book.metadata new_book.spine = self.origin_book.spine new_book.toc = self.origin_book.toc - all_items = list(self.origin_book.get_items()) - # we just translate tag p - all_p_length = sum( - [len(bs(i.content, "html.parser").findAll("p")) for i in all_items] - ) - print("TODO need process bar here: " + str(all_p_length)) + + # we just translate tag + with mp.Pool() as pool: + translated_p_list = pool.map( + lambda x: (x[0], self.translate_model.translate(x[1])), + [(i, p.string) for i in self.origin_book.get_items() + if i.get_type() == 9 + for p in bs(i.content, "html.parser").findAll("p") + if p.string and not p.string.isdigit()] + ) + print("TODO need process bar here: " + len(translated_p_list)) + + # Update the "p" tags with their translations index = 0 - for i in self.origin_book.get_items(): - if i.get_type() == 9: - soup = bs(i.content, "html.parser") - p_list = soup.findAll("p") - is_test_done = IS_TEST and index > 20 - for p in p_list: - if not is_test_done: - if p.string and not p.string.isdigit(): - new_p = copy(p) - # TODO banch of p to translate then combine - # PR welcome here - new_p.string = self.translate_model.translate(p.string) - p.insert_after(new_p) - index += 1 - i.content = soup.prettify().encode() + for i, translated_p in translated_p_list: + soup = bs(i.content, "html.parser") + p_list = soup.findAll("p") + for j, p in enumerate(p_list): + if p.string == translated_p[j][0]: + new_p = copy(p) + new_p.string = translated_p[j][1] + p.insert_after(new_p) + index += 1 + if IS_TEST and index > 20: + break + i.content = soup.prettify().encode() new_book.add_item(i) + if IS_TEST and index > 20: + break + + # Add remaining items to the new book + for i in self.origin_book.get_items(): + if i.get_type() != 9: + new_book.add_item(i) + name = self.epub_name.split(".")[0] epub.write_epub(f"{name}_bilingual.epub", new_book, {}) From f3bea71cccbf2ecc27301f9fdbba84d2c238a3a1 Mon Sep 17 00:00:00 2001 From: zengzzzzz Date: Thu, 2 Mar 2023 16:30:53 +0800 Subject: [PATCH 2/4] fix the batch func in order --- make.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/make.py b/make.py index db3fb532..294c16e7 100644 --- a/make.py +++ b/make.py @@ -119,6 +119,51 @@ def __init__(self, epub_name, model, key): self.epub_name = epub_name self.translate_model = model(key) self.origin_book = epub.read_epub(self.epub_name) + self.test_limit_index = 0 + + def translate_p(self, p): + if p.string and not p.string.isdigit(): + new_p = copy(p) + new_p.string = self.translate_model.translate(p.string) + p.insert_after(new_p) + + def translate_item(self, item): + soup = bs(item.content, "html.parser") + p_list = soup.findAll("p") + is_test_done = IS_TEST and self.test_limit_index > 20 + for p in p_list: + if not is_test_done: + self.translate_p(p) + self.test_limit_index += 1 + item.content = soup.prettify().encode() + return item + + def make_bilingual_book(self): + new_book = epub.EpubBook() + new_book.metadata = self.origin_book.metadata + new_book.spine = self.origin_book.spine + new_book.toc = self.origin_book.toc + all_items = list(self.origin_book.get_items()) + # we just translate tag p + all_p_length = sum( + [len(bs(i.content, "html.parser").findAll("p")) for i in all_items] + ) + print("TODO need process bar here: " + str(all_p_length)) + pool = mp.Pool() + processed_items = pool.map(self.translate_item, all_items) + pool.close() + pool.join() + for item in processed_items: + new_book.add_item(item) + + for item in new_book.spine: + new_item = new_book.get_item_with_href(item.href) + new_book.add_item(new_item) + new_book.spine.remove(item) + new_book.spine.append(new_item) + + name = self.epub_name.split(".")[0] + epub.write_epub(f"{name}_bilingual.epub", new_book, {}) def make_bilingual_book(self): new_book = epub.EpubBook() @@ -126,6 +171,7 @@ def make_bilingual_book(self): new_book.spine = self.origin_book.spine new_book.toc = self.origin_book.toc + self.origin_book.spine # we just translate tag with mp.Pool() as pool: translated_p_list = pool.map( From ed4065b16c0501a97d09d7e5f168755cdcc358d0 Mon Sep 17 00:00:00 2001 From: zengzzzzz Date: Thu, 2 Mar 2023 16:33:15 +0800 Subject: [PATCH 3/4] del func --- make.py | 48 ++---------------------------------------------- 1 file changed, 2 insertions(+), 46 deletions(-) diff --git a/make.py b/make.py index 294c16e7..68eb179b 100644 --- a/make.py +++ b/make.py @@ -150,12 +150,13 @@ def make_bilingual_book(self): ) print("TODO need process bar here: " + str(all_p_length)) pool = mp.Pool() + # use mul pool to translate processed_items = pool.map(self.translate_item, all_items) pool.close() pool.join() for item in processed_items: new_book.add_item(item) - + # in order to make epub valid for item in new_book.spine: new_item = new_book.get_item_with_href(item.href) new_book.add_item(new_item) @@ -165,51 +166,6 @@ def make_bilingual_book(self): name = self.epub_name.split(".")[0] epub.write_epub(f"{name}_bilingual.epub", new_book, {}) - def make_bilingual_book(self): - new_book = epub.EpubBook() - new_book.metadata = self.origin_book.metadata - new_book.spine = self.origin_book.spine - new_book.toc = self.origin_book.toc - - self.origin_book.spine - # we just translate tag - with mp.Pool() as pool: - translated_p_list = pool.map( - lambda x: (x[0], self.translate_model.translate(x[1])), - [(i, p.string) for i in self.origin_book.get_items() - if i.get_type() == 9 - for p in bs(i.content, "html.parser").findAll("p") - if p.string and not p.string.isdigit()] - ) - print("TODO need process bar here: " + len(translated_p_list)) - - # Update the "p" tags with their translations - index = 0 - for i, translated_p in translated_p_list: - soup = bs(i.content, "html.parser") - p_list = soup.findAll("p") - for j, p in enumerate(p_list): - if p.string == translated_p[j][0]: - new_p = copy(p) - new_p.string = translated_p[j][1] - p.insert_after(new_p) - index += 1 - if IS_TEST and index > 20: - break - i.content = soup.prettify().encode() - new_book.add_item(i) - if IS_TEST and index > 20: - break - - # Add remaining items to the new book - for i in self.origin_book.get_items(): - if i.get_type() != 9: - new_book.add_item(i) - - name = self.epub_name.split(".")[0] - epub.write_epub(f"{name}_bilingual.epub", new_book, {}) - - if __name__ == "__main__": MODEL_DICT = {"gpt3": GPT3, "chatgpt": ChatGPT} parser = argparse.ArgumentParser() From 6ee2a6fca70f88cb5aa6044985e9d9f8239c6e7b Mon Sep 17 00:00:00 2001 From: zengzh Date: Thu, 2 Mar 2023 13:03:21 +0000 Subject: [PATCH 4/4] fix the href error --- make.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/make.py b/make.py index e03ca0f1..4f08c6c1 100644 --- a/make.py +++ b/make.py @@ -154,15 +154,11 @@ def make_bilingual_book(self): processed_items = pool.map(self.translate_item, all_items) pool.close() pool.join() - for item in processed_items: - new_book.add_item(item) # in order to make epub valid + item_map = { item.id: item for item in processed_items} for item in new_book.spine: - new_item = new_book.get_item_with_href(item.href) + new_item = item_map[item[0]] new_book.add_item(new_item) - new_book.spine.remove(item) - new_book.spine.append(new_item) - name = self.epub_name.split(".")[0] epub.write_epub(f"{name}_bilingual.epub", new_book, {})