
Commit
Fix FAQ docx ui upload parsing support and reformat docx.py (#3645)
* Fix FAQ docx ui upload parsing support and reformat docx.py

* Add more comments

* Optimize concatenate texts
w5688414 authored Nov 2, 2022
1 parent 81bb7d1 commit ff546f6
Showing 5 changed files with 118 additions and 51 deletions.
11 changes: 10 additions & 1 deletion pipelines/examples/FAQ/README.md
@@ -162,12 +162,21 @@ sh examples/frequently-asked-question/run_faq_web.sh

#### 3.4.5 Data Updates

Data is updated with the aforementioned `utils/offline_ann.py`; example data (demo.txt) is shown below:
There are two ways to update data. The first is file upload through the web UI, which supports txt and word files; each entry must consist of two columns, Question and Answer, separated by \t. In word files, entries are additionally separated by blank lines; in txt files, one entry per line (ordinary line breaks) is enough. The second is to use the aforementioned `utils/offline_ann.py`; example data (demo.txt) is shown below, and a parsing sketch follows the examples:

```
我想买保险,可以买哪些? 人身保障的保险,主要可以分为四大险种——即意外险、重疾险、医疗险和寿险。意外险——像过马路被车撞、被开水烫伤等等意外,意外险皆可赔付。医疗险——花多少钱报销多少钱,一般建议买百万医疗险。重疾险——得了重疾,按比例一次性赔付你约定保额。寿险——身故即赔。
选保险产品时,保险公司很重要吗? 重要,但不是第一重要,也不是最重要。产品应该是优先于公司的,毕竟产品的保障才是最直接和我们的利益挂钩的。在保险产品的保障差不多的情况下,知名度更高的保险公司会更好。
```
Example data in word format:

```
我想买保险,可以买哪些? 可以买哪些?人身保障的保险,主要可以分为四大险种——即意外险、重疾险、医疗险和寿险。意外险——像过马路被车撞、被开水烫伤等等意外,意外险皆可赔付。医疗险——花多少钱报销多少钱,一般建议买百万医疗险。重疾险——得了重疾,按比例一次性赔付你约定保额。寿险——身故即赔。
选保险产品时,保险公司很重要吗? 重要,但不是第一重要,也不是最重要。产品应该是优先于公司的,毕竟产品的保障才是最直接和我们的利益挂钩的。在保险产品的保障差不多的情况下,知名度更高的保险公司会更好。
```
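For illustration, a minimal sketch of how one such two-column line might be parsed; the function name `parse_faq_line` is made up here and is not part of the commit:

```python
def parse_faq_line(line: str):
    """Split one FAQ line into (question, answer); return None for blank separator lines."""
    line = line.strip()
    if not line:
        return None  # blank lines separate entries in the word format
    columns = line.split("\t")
    if len(columns) != 2:
        raise ValueError("Each line must contain exactly two columns separated by \\t")
    question, answer = columns
    return question, answer
```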


If you run into problems during installation, see the [FAQ document](../../FAQ.md).

81 changes: 48 additions & 33 deletions pipelines/pipelines/nodes/file_converter/docx.py
@@ -108,6 +108,8 @@ def convert(
# This part parses docx files that contain images; a block of text and the images that follow it are added as one document
for i in range(len(file.paragraphs)):
paragraph = file.paragraphs[i]
# Extracting images from the paragraph
image_list = self.get_image_list(file, paragraph)
# Extracting text from the paragraph
# If there is text, add it to text_dict
if (paragraph.text != ""):
@@ -116,44 +118,57 @@
text_dict = {'text': [text], 'images': []}
else:
text_dict['text'].append(text)
# Extracting images from the paragraph
image_list = self.get_image_list(file, paragraph)
# If there are no images and text_dict is not empty, add text_dict to documents
if (image_list is None and bool(text_dict)):
raw_text = ''.join(text_dict['text'])
# If the extracted text is "", skip it
if (raw_text == ''):
continue
meta_data = {}
if (meta is not None and 'name' in meta):
meta_data['name'] = meta['name']
meta_data['images'] = text_dict['images']
document = {
"content": raw_text,
"content_type": "text",
"meta": meta_data
}
documents.append(document)
if (image_list is not None):
image_names = self.save_images(image_list)
text_dict['images'] += image_names
else:
# If there are no images and text_dict is not empty, add text_dict to documents
if (image_list is None and bool(text_dict)):
raw_text = ''.join(text_dict['text'])
# If the extracted text is "", skip it
if (raw_text == ''):
continue
meta_data = {}
if (meta is not None and 'name' in meta):
meta_data['name'] = meta['name']
meta_data['images'] = text_dict['images']
document = {
"content": raw_text,
"content_type": "text",
"meta": meta_data
}
documents.append(document)

text = paragraph.text
text_dict = {'text': [text], 'images': []}
elif (image_list is not None):
for i, image in enumerate(image_list):
if image:
# File extension & file content
ext, blob = image.ext, image.blob
# Use the MD5 of the image bytes to generate the image name, then save the image into desc_path
md5hash = hashlib.md5(blob)
md5_name = md5hash.hexdigest()
image_name = '{}_{}.{}'.format(md5_name, i, ext)
image_path = os.path.join(self.desc_path, image_name)
Image.open(BytesIO(blob)).save(image_path)
# Adding image_name into the text_dict as the image for the text
text_dict['images'].append(image_name)
text = paragraph.text
text_dict = {'text': [text], 'images': []}
elif (image_list is not None):
image_names = self.save_images(image_list)
text_dict['images'] += image_names
else:
continue
return documents

def save_images(self, image_list):
"""
Save the parsed image into desc_path
:param image_list: image files from the docx file
"""
image_names = []
for i, image in enumerate(image_list):
if image:
# File extension & file content
ext, blob = image.ext, image.blob
# Use the MD5 of the image bytes to generate the image name, then save the image into desc_path
md5hash = hashlib.md5(blob)
md5_name = md5hash.hexdigest()
image_name = '{}_{}.{}'.format(md5_name, i, ext)
image_path = os.path.join(self.desc_path, image_name)
Image.open(BytesIO(blob)).save(image_path)
# Collect image_name so the caller can attach it to the corresponding text
image_names.append(image_name)

return image_names

def get_image_list(self, document: Document, paragraph: Paragraph):
"""
Extract images from paragraph and document object.
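The MD5-based naming scheme used by `save_images` can be shown standalone; a minimal sketch (the helper name `make_image_name` and the sample bytes are illustrative, not part of the commit):

```python
import hashlib

def make_image_name(blob: bytes, index: int, ext: str) -> str:
    # Hash the raw image bytes so identical images always map to the same name
    md5_name = hashlib.md5(blob).hexdigest()
    return '{}_{}.{}'.format(md5_name, index, ext)

# Example: deterministic, collision-resistant file names
print(make_image_name(b'fake-image-bytes', 0, 'png'))
# -> '<32 hex chars>_0.png'
```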
45 changes: 36 additions & 9 deletions pipelines/pipelines/nodes/preprocessor/preprocessor.py
@@ -60,6 +60,7 @@ def __init__(
split_by: str = "word",
split_length: int = 200,
split_overlap: int = 0,
split_answers: bool = False,
split_respect_sentence_boundary: bool = True,
language: str = "en",
):
@@ -93,6 +94,7 @@ def __init__(
split_by=split_by,
split_length=split_length,
split_overlap=split_overlap,
split_answers=split_answers,
split_respect_sentence_boundary=split_respect_sentence_boundary,
)

@@ -110,6 +112,7 @@ def __init__(
self.split_respect_sentence_boundary = split_respect_sentence_boundary
self.language = iso639_to_nltk.get(language, language)
self.print_log: Set[str] = set()
self.split_answers = split_answers

def process(
self,
@@ -160,6 +163,7 @@ def _process_single(
split_by: Optional[str] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_answers: Optional[bool] = None,
split_respect_sentence_boundary: Optional[bool] = None,
) -> List[dict]:

@@ -177,6 +181,8 @@
split_overlap = self.split_overlap
if split_respect_sentence_boundary is None:
split_respect_sentence_boundary = self.split_respect_sentence_boundary
if (split_answers is None):
split_answers = self.split_answers

cleaned_document = self.clean(
document=document,
@@ -189,6 +195,7 @@
split_by=split_by,
split_length=split_length,
split_overlap=split_overlap,
split_answers=split_answers,
split_respect_sentence_boundary=split_respect_sentence_boundary,
)
return split_documents
@@ -240,6 +247,7 @@ def split(
split_by: str,
split_length: int,
split_overlap: int,
split_answers: bool,
split_respect_sentence_boundary: bool,
) -> List[dict]:
"""Perform document splitting on a single document. This method can split on different units, at different lengths,
@@ -303,7 +311,10 @@ def split(
text_splits.append(txt)
else:
# create individual "elements" of passage, sentence, or word
if split_by == "passage":
# FAQ text needs to be split on '\n' within a passage
if split_answers and split_by == "passage":
text_splits = text.split("\n")
elif split_by == "passage":
elements = text.split("\n\n")
elif split_by == "sentence":
elements = nltk.tokenize.sent_tokenize(text,
@@ -316,25 +327,41 @@
)

# concatenate individual elements based on split_length & split_stride
if split_overlap:
# FAQ text processing does not need to split the text into fixed lengths
if (not split_answers):
segments = windowed(elements,
n=split_length,
step=split_length - split_overlap)
else:
segments = windowed(elements, n=split_length, step=split_length)
text_splits = []
for seg in segments:
txt = " ".join([t for t in seg if t is not None])
if len(txt) > 0:
text_splits.append(txt)

text_splits = []
for seg in segments:
txt = " ".join([t for t in seg if t is not None])
if len(txt) > 0:
text_splits.append(txt)
# create new document dicts for each text split
documents = []
for i, txt in enumerate(text_splits):
doc = deepcopy(document)
doc["content"] = txt

if "meta" not in doc.keys() or doc["meta"] is None:
doc["meta"] = {}
if (split_answers):
text_arr = doc["content"].split('\t')
if (len(text_arr) > 2):
raise Exception(
"Each line text must be two columns and separated by \t"
)
# The line may be empty
if (len(text_arr) == 1):
logger.info(
'Some lines in your text could not be parsed into question and answer; they may be empty lines'
)
continue
else:
query, answer = text_arr
doc["content"] = query
doc["meta"]["answer"] = answer
doc["meta"]["_split_id"] = i
documents.append(doc)

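Taken together, the new `split_answers` path amounts to the following; a minimal standalone sketch (the helper name `split_faq_text` is illustrative, not part of the commit):

```python
import logging

logger = logging.getLogger(__name__)

def split_faq_text(text: str):
    """Split FAQ text into one document per line; the answer is stored in meta."""
    documents = []
    for i, line in enumerate(text.split("\n")):
        columns = line.split("\t")
        if len(columns) > 2:
            raise Exception("Each line must contain exactly two columns separated by \\t")
        if len(columns) == 1:
            # Possibly an empty separator line
            logger.info("Line %d could not be parsed into question and answer", i)
            continue
        query, answer = columns
        documents.append({
            "content": query,
            "content_type": "text",
            "meta": {"answer": answer, "_split_id": i},
        })
    return documents
```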
5 changes: 3 additions & 2 deletions pipelines/rest_api/pipeline/dense_faq.yaml
@@ -32,8 +32,9 @@ components: # define all the building-blocks for Pipeline
- name: Preprocessor
type: PreProcessor
params:
split_by: word
split_length: 1000
split_by: passage
split_respect_sentence_boundary: False
split_answers: True
- name: FileTypeClassifier
type: FileTypeClassifier

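Equivalently, constructing the node directly in Python with the updated parameters might look like this (a sketch; the import path is an assumption based on the file layout above):

```python
# Assumed import path; the class is defined in pipelines/pipelines/nodes/preprocessor/preprocessor.py
from pipelines.nodes.preprocessor.preprocessor import PreProcessor

# Mirrors the Preprocessor params in dense_faq.yaml after this commit
preprocessor = PreProcessor(
    split_by="passage",                     # FAQ files are treated as passages
    split_respect_sentence_boundary=False,  # FAQ lines are not natural sentences
    split_answers=True,                     # parse each line as question\tanswer
)
```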
27 changes: 21 additions & 6 deletions pipelines/ui/webapp_faq.py
@@ -56,6 +56,20 @@ def on_change_text():
st.session_state.raw_json = None


def upload():
data_files = st.session_state.upload_files['files']
for data_file in data_files:
# Upload file
if data_file and data_file.name not in st.session_state.upload_files[
'uploaded_files']:
raw_json = upload_doc(data_file)
st.session_state.upload_files['uploaded_files'].append(
data_file.name)
# Deduplicate the saved list of uploaded file names
st.session_state.upload_files['uploaded_files'] = list(
set(st.session_state.upload_files['uploaded_files']))


def main():

st.set_page_config(
Expand All @@ -68,6 +82,7 @@ def main():
set_state_if_absent("results", None)
set_state_if_absent("raw_json", None)
set_state_if_absent("random_question_requested", False)
set_state_if_absent("upload_files", {'uploaded_files': [], 'files': []})

# Small callback to reset the interface in case the text of the question changes
def reset_results(*args):
@@ -101,13 +116,13 @@ def reset_results(*args):
data_files = st.sidebar.file_uploader(
"",
type=["pdf", "txt", "docx", "png"],
help="文件上传",
help="选择多个文件",
accept_multiple_files=True)
for data_file in data_files:
# Upload file
if data_file:
raw_json = upload_doc(data_file)
st.sidebar.write(str(data_file.name) + "    ✅ ")
st.session_state.upload_files['files'] = data_files
st.sidebar.button("文件上传", on_click=upload)
for data_file in st.session_state.upload_files['uploaded_files']:
st.sidebar.write(str(data_file) + "    ✅ ")

hs_version = ""
try:
hs_version = f" <small>(v{pipelines_version()})</small>"
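The deferred-upload pattern introduced here generalizes beyond this app; a minimal standalone sketch (`send_to_backend` is a hypothetical stand-in for the real `upload_doc` REST call):

```python
import streamlit as st

def send_to_backend(data_file):
    # Hypothetical placeholder for the real upload_doc() REST call
    pass

# Persist upload bookkeeping across Streamlit reruns
if "upload_files" not in st.session_state:
    st.session_state["upload_files"] = {"uploaded_files": [], "files": []}

def upload():
    for data_file in st.session_state.upload_files["files"]:
        # Only send files the backend has not seen yet
        if data_file and data_file.name not in st.session_state.upload_files["uploaded_files"]:
            send_to_backend(data_file)
            st.session_state.upload_files["uploaded_files"].append(data_file.name)

data_files = st.sidebar.file_uploader("",
                                      type=["txt", "docx"],
                                      accept_multiple_files=True)
st.session_state.upload_files["files"] = data_files or []
st.sidebar.button("Upload", on_click=upload)
```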
