
Commit
Fix FAQ docx ui upload parsing support and reformat docx.py (#3645)
* Fix FAQ docx ui upload parsing support and reformat docx.py

* Add more comments

* Optimize concatenate texts
w5688414 authored Nov 2, 2022
1 parent 81bb7d1 commit ff546f6
Showing 5 changed files with 118 additions and 51 deletions.
11 changes: 10 additions & 1 deletion pipelines/examples/FAQ/README.md
@@ -162,12 +162,21 @@ sh examples/frequently-asked-question/run_faq_web.sh

#### 3.4.5 Data Updates

Data is updated with the aforementioned `utils/offline_ann.py`; example data (demo.txt) is shown below:
There are two ways to update data. The first is file upload through the web UI, which supports txt and word files; each entry must consist of two columns, Question and Answer, separated by \t. In word files, entries are additionally separated by blank lines; in txt files, one entry per line (ordinary line breaks) is enough. The second is to use the aforementioned `utils/offline_ann.py`; example data (demo.txt) is shown below, and a parsing sketch follows the examples:

```
我想买保险,可以买哪些? 人身保障的保险,主要可以分为四大险种——即意外险、重疾险、医疗险和寿险。意外险——像过马路被车撞、被开水烫伤等等意外,意外险皆可赔付。医疗险——花多少钱报销多少钱,一般建议买百万医疗险。重疾险——得了重疾,按比例一次性赔付你约定保额。寿险——身故即赔。
选保险产品时,保险公司很重要吗? 重要,但不是第一重要,也不是最重要。产品应该是优先于公司的,毕竟产品的保障才是最直接和我们的利益挂钩的。在保险产品的保障差不多的情况下,知名度更高的保险公司会更好。
```
Example data in word format:

```
我想买保险,可以买哪些? 可以买哪些?人身保障的保险,主要可以分为四大险种——即意外险、重疾险、医疗险和寿险。意外险——像过马路被车撞、被开水烫伤等等意外,意外险皆可赔付。医疗险——花多少钱报销多少钱,一般建议买百万医疗险。重疾险——得了重疾,按比例一次性赔付你约定保额。寿险——身故即赔。
选保险产品时,保险公司很重要吗? 重要,但不是第一重要,也不是最重要。产品应该是优先于公司的,毕竟产品的保障才是最直接和我们的利益挂钩的。在保险产品的保障差不多的情况下,知名度更高的保险公司会更好。
```
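For illustration, a minimal sketch of how one such two-column line might be parsed; the function name `parse_faq_line` is made up here and is not part of the commit:

```python
def parse_faq_line(line: str):
    """Split one FAQ line into (question, answer); return None for blank separator lines."""
    line = line.strip()
    if not line:
        return None  # blank lines separate entries in the word format
    columns = line.split("\t")
    if len(columns) != 2:
        raise ValueError("Each line must contain exactly two columns separated by \\t")
    question, answer = columns
    return question, answer
```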


If you run into problems during installation, see the [FAQ document](../../FAQ.md).

81 changes: 48 additions & 33 deletions pipelines/pipelines/nodes/file_converter/docx.py
@@ -108,6 +108,8 @@ def convert(
# This part parses docx files that contain images; a block of text and the images that follow it are added as one document
for i in range(len(file.paragraphs)):
paragraph = file.paragraphs[i]
# Extracting images from the paragraph
image_list = self.get_image_list(file, paragraph)
# Extracting text from the paragraph
# If there is text, add it to text_dict
if (paragraph.text != ""):
@@ -116,44 +118,57 @@
text_dict = {'text': [text], 'images': []}
else:
text_dict['text'].append(text)
# Extracting images from the paragraph
image_list = self.get_image_list(file, paragraph)
# If there are no images and text_dict is not empty, add text_dict to documents
if (image_list is None and bool(text_dict)):
raw_text = ''.join(text_dict['text'])
# If the extracted text is "", skip it
if (raw_text == ''):
continue
meta_data = {}
if (meta is not None and 'name' in meta):
meta_data['name'] = meta['name']
meta_data['images'] = text_dict['images']
document = {
"content": raw_text,
"content_type": "text",
"meta": meta_data
}
documents.append(document)
if (image_list is not None):
image_names = self.save_images(image_list)
text_dict['images'] += image_names
else:
# If there are no images and text_dict is not empty, add text_dict to documents
if (image_list is None and bool(text_dict)):
raw_text = ''.join(text_dict['text'])
# If the extracted text is "", skip it
if (raw_text == ''):
continue
meta_data = {}
if (meta is not None and 'name' in meta):
meta_data['name'] = meta['name']
meta_data['images'] = text_dict['images']
document = {
"content": raw_text,
"content_type": "text",
"meta": meta_data
}
documents.append(document)

text = paragraph.text
text_dict = {'text': [text], 'images': []}
elif (image_list is not None):
for i, image in enumerate(image_list):
if image:
# File extension & file content
ext, blob = image.ext, image.blob
# Use the MD5 of the image bytes to generate the image name, then save the image into desc_path
md5hash = hashlib.md5(blob)
md5_name = md5hash.hexdigest()
image_name = '{}_{}.{}'.format(md5_name, i, ext)
image_path = os.path.join(self.desc_path, image_name)
Image.open(BytesIO(blob)).save(image_path)
# Adding image_name into the text_dict as the image for the text
text_dict['images'].append(image_name)
text = paragraph.text
text_dict = {'text': [text], 'images': []}
elif (image_list is not None):
image_names = self.save_images(image_list)
text_dict['images'] += image_names
else:
continue
return documents

def save_images(self, image_list):
"""
Save the parsed image into desc_path
:param image_list: image files from the docx file
"""
image_names = []
for i, image in enumerate(image_list):
if image:
# File extension & file content
ext, blob = image.ext, image.blob
# Use the MD5 of the image bytes to generate the image name, then save the image into desc_path
md5hash = hashlib.md5(blob)
md5_name = md5hash.hexdigest()
image_name = '{}_{}.{}'.format(md5_name, i, ext)
image_path = os.path.join(self.desc_path, image_name)
Image.open(BytesIO(blob)).save(image_path)
# Collect image_name so the caller can attach it to the corresponding text
image_names.append(image_name)

return image_names

def get_image_list(self, document: Document, paragraph: Paragraph):
"""
Extract images from paragraph and document object.
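The MD5-based naming scheme used by `save_images` can be shown standalone; a minimal sketch (the helper name `make_image_name` and the sample bytes are illustrative, not part of the commit):

```python
import hashlib

def make_image_name(blob: bytes, index: int, ext: str) -> str:
    # Hash the raw image bytes so identical images always map to the same name
    md5_name = hashlib.md5(blob).hexdigest()
    return '{}_{}.{}'.format(md5_name, index, ext)

# Example: deterministic, collision-resistant file names
print(make_image_name(b'fake-image-bytes', 0, 'png'))
# -> '<32 hex chars>_0.png'
```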
45 changes: 36 additions & 9 deletions pipelines/pipelines/nodes/preprocessor/preprocessor.py
@@ -60,6 +60,7 @@ def __init__(
split_by: str = "word",
split_length: int = 200,
split_overlap: int = 0,
split_answers: bool = False,
split_respect_sentence_boundary: bool = True,
language: str = "en",
):
@@ -93,6 +94,7 @@ def __init__(
split_by=split_by,
split_length=split_length,
split_overlap=split_overlap,
split_answers=split_answers,
split_respect_sentence_boundary=split_respect_sentence_boundary,
)

@@ -110,6 +112,7 @@ def __init__(
self.split_respect_sentence_boundary = split_respect_sentence_boundary
self.language = iso639_to_nltk.get(language, language)
self.print_log: Set[str] = set()
self.split_answers = split_answers

def process(
self,
@@ -160,6 +163,7 @@ def _process_single(
split_by: Optional[str] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_answers: Optional[bool] = None,
split_respect_sentence_boundary: Optional[bool] = None,
) -> List[dict]:

@@ -177,6 +181,8 @@
split_overlap = self.split_overlap
if split_respect_sentence_boundary is None:
split_respect_sentence_boundary = self.split_respect_sentence_boundary
if (split_answers is None):
split_answers = self.split_answers

cleaned_document = self.clean(
document=document,
@@ -189,6 +195,7 @@
split_by=split_by,
split_length=split_length,
split_overlap=split_overlap,
split_answers=split_answers,
split_respect_sentence_boundary=split_respect_sentence_boundary,
)
return split_documents
@@ -240,6 +247,7 @@ def split(
split_by: str,
split_length: int,
split_overlap: int,
split_answers: bool,
split_respect_sentence_boundary: bool,
) -> List[dict]:
"""Perform document splitting on a single document. This method can split on different units, at different lengths,
@@ -303,7 +311,10 @@ def split(
text_splits.append(txt)
else:
# create individual "elements" of passage, sentence, or word
if split_by == "passage":
# FAQ text needs to be split on '\n' within a passage
if split_answers and split_by == "passage":
text_splits = text.split("\n")
elif split_by == "passage":
elements = text.split("\n\n")
elif split_by == "sentence":
elements = nltk.tokenize.sent_tokenize(text,
@@ -316,25 +327,41 @@
)

# concatenate individual elements based on split_length & split_stride
if split_overlap:
# FAQ text processing does not need to split the text into fixed lengths
if (not split_answers):
segments = windowed(elements,
n=split_length,
step=split_length - split_overlap)
else:
segments = windowed(elements, n=split_length, step=split_length)
text_splits = []
for seg in segments:
txt = " ".join([t for t in seg if t is not None])
if len(txt) > 0:
text_splits.append(txt)

text_splits = []
for seg in segments:
txt = " ".join([t for t in seg if t is not None])
if len(txt) > 0:
text_splits.append(txt)
# create new document dicts for each text split
documents = []
for i, txt in enumerate(text_splits):
doc = deepcopy(document)
doc["content"] = txt

if "meta" not in doc.keys() or doc["meta"] is None:
doc["meta"] = {}
if (split_answers):
text_arr = doc["content"].split('\t')
if (len(text_arr) > 2):
raise Exception(
"Each line text must be two columns and separated by \t"
)
# The line may be empty
if (len(text_arr) == 1):
logger.info(
'Some lines in your text could not be parsed into question and answer; they may be empty lines'
)
continue
else:
query, answer = text_arr
doc["content"] = query
doc["meta"]["answer"] = answer
doc["meta"]["_split_id"] = i
documents.append(doc)

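Taken together, the new `split_answers` path amounts to the following; a minimal standalone sketch (the helper name `split_faq_text` is illustrative, not part of the commit):

```python
import logging

logger = logging.getLogger(__name__)

def split_faq_text(text: str):
    """Split FAQ text into one document per line; the answer is stored in meta."""
    documents = []
    for i, line in enumerate(text.split("\n")):
        columns = line.split("\t")
        if len(columns) > 2:
            raise Exception("Each line must contain exactly two columns separated by \\t")
        if len(columns) == 1:
            # Possibly an empty separator line
            logger.info("Line %d could not be parsed into question and answer", i)
            continue
        query, answer = columns
        documents.append({
            "content": query,
            "content_type": "text",
            "meta": {"answer": answer, "_split_id": i},
        })
    return documents
```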
5 changes: 3 additions & 2 deletions pipelines/rest_api/pipeline/dense_faq.yaml
@@ -32,8 +32,9 @@ components: # define all the building-blocks for Pipeline
- name: Preprocessor
type: PreProcessor
params:
split_by: word
split_length: 1000
split_by: passage
split_respect_sentence_boundary: False
split_answers: True
- name: FileTypeClassifier
type: FileTypeClassifier

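Equivalently, constructing the node directly in Python with the updated parameters might look like this (a sketch; the import path is an assumption based on the file layout above):

```python
# Assumed import path; the class is defined in pipelines/pipelines/nodes/preprocessor/preprocessor.py
from pipelines.nodes.preprocessor.preprocessor import PreProcessor

# Mirrors the Preprocessor params in dense_faq.yaml after this commit
preprocessor = PreProcessor(
    split_by="passage",                     # FAQ files are treated as passages
    split_respect_sentence_boundary=False,  # FAQ lines are not natural sentences
    split_answers=True,                     # parse each line as question\tanswer
)
```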
27 changes: 21 additions & 6 deletions pipelines/ui/webapp_faq.py
@@ -56,6 +56,20 @@ def on_change_text():
st.session_state.raw_json = None


def upload():
data_files = st.session_state.upload_files['files']
for data_file in data_files:
# Upload file
if data_file and data_file.name not in st.session_state.upload_files[
'uploaded_files']:
raw_json = upload_doc(data_file)
st.session_state.upload_files['uploaded_files'].append(
data_file.name)
# Deduplicate the saved list of uploaded file names
st.session_state.upload_files['uploaded_files'] = list(
set(st.session_state.upload_files['uploaded_files']))


def main():

st.set_page_config(
Expand All @@ -68,6 +82,7 @@ def main():
set_state_if_absent("results", None)
set_state_if_absent("raw_json", None)
set_state_if_absent("random_question_requested", False)
set_state_if_absent("upload_files", {'uploaded_files': [], 'files': []})

# Small callback to reset the interface in case the text of the question changes
def reset_results(*args):
@@ -101,13 +116,13 @@ def reset_results(*args):
data_files = st.sidebar.file_uploader(
"",
type=["pdf", "txt", "docx", "png"],
help="文件上传",
help="选择多个文件",
accept_multiple_files=True)
for data_file in data_files:
# Upload file
if data_file:
raw_json = upload_doc(data_file)
st.sidebar.write(str(data_file.name) + "    ✅ ")
st.session_state.upload_files['files'] = data_files
st.sidebar.button("文件上传", on_click=upload)
for data_file in st.session_state.upload_files['uploaded_files']:
st.sidebar.write(str(data_file) + "    ✅ ")

hs_version = ""
try:
hs_version = f" <small>(v{pipelines_version()})</small>"
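The deferred-upload pattern introduced here generalizes beyond this app; a minimal standalone sketch (`send_to_backend` is a hypothetical stand-in for the real `upload_doc` REST call):

```python
import streamlit as st

def send_to_backend(data_file):
    # Hypothetical placeholder for the real upload_doc() REST call
    pass

# Persist upload bookkeeping across Streamlit reruns
if "upload_files" not in st.session_state:
    st.session_state["upload_files"] = {"uploaded_files": [], "files": []}

def upload():
    for data_file in st.session_state.upload_files["files"]:
        # Only send files the backend has not seen yet
        if data_file and data_file.name not in st.session_state.upload_files["uploaded_files"]:
            send_to_backend(data_file)
            st.session_state.upload_files["uploaded_files"].append(data_file.name)

data_files = st.sidebar.file_uploader("",
                                      type=["txt", "docx"],
                                      accept_multiple_files=True)
st.session_state.upload_files["files"] = data_files or []
st.sidebar.button("Upload", on_click=upload)
```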
