Fix FAQ docx ui upload parsing support and reformat docx.py #3645

Merged 5 commits on Nov 2, 2022

Changes from 2 commits
11 changes: 10 additions & 1 deletion pipelines/examples/FAQ/README.md
@@ -162,12 +162,21 @@ sh examples/frequently-asked-question/run_faq_web.sh
 
 #### 3.4.5 Data Update
 
-To update the data, use the aforementioned `utils/offline_ann.py`; example data is shown below (demo.txt):
+There are two ways to update the data. The first is file upload through the web UI, which supports txt and Word files; each entry must consist of two columns, Question and Answer, separated by \t. In Word files, entries are additionally separated by blank lines; in txt files, one entry per line (a normal line break) is enough. The second is to use the aforementioned `utils/offline_ann.py`; example data is shown below (demo.txt):
 
 ```
 我想买保险,可以买哪些?	人身保障的保险,主要可以分为四大险种——即意外险、重疾险、医疗险和寿险。意外险——像过马路被车撞、被开水烫伤等等意外,意外险皆可赔付。医疗险——花多少钱报销多少钱,一般建议买百万医疗险。重疾险——得了重疾,按比例一次性赔付你约定保额。寿险——身故即赔。
 选保险产品时,保险公司很重要吗?	重要,但不是第一重要,也不是最重要。产品应该是优先于公司的,毕竟产品的保障才是最直接和我们的利益挂钩的。在保险产品的保障差不多的情况下,知名度更高的保险公司会更好。
 ```
+Word example data:
+
+```
+我想买保险,可以买哪些?	人身保障的保险,主要可以分为四大险种——即意外险、重疾险、医疗险和寿险。意外险——像过马路被车撞、被开水烫伤等等意外,意外险皆可赔付。医疗险——花多少钱报销多少钱,一般建议买百万医疗险。重疾险——得了重疾,按比例一次性赔付你约定保额。寿险——身故即赔。
+
+选保险产品时,保险公司很重要吗?	重要,但不是第一重要,也不是最重要。产品应该是优先于公司的,毕竟产品的保障才是最直接和我们的利益挂钩的。在保险产品的保障差不多的情况下,知名度更高的保险公司会更好。
+
+```
 
 If you run into problems during installation, see the [FAQ document](../../FAQ.md)
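To make the upload format concrete, here is a minimal, hypothetical parser sketch (not part of this PR; `parse_faq_file` is an illustrative name) for the tab-separated data described above:

```python
# Hypothetical sketch: reading demo.txt-style FAQ data, where each
# non-empty line is "question<TAB>answer" and blank lines only
# separate entries (as in the Word export).
def parse_faq_file(path: str) -> list:
    pairs = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank separator lines
            columns = line.split("\t")
            if len(columns) != 2:
                raise ValueError("each line must be two \\t-separated columns")
            pairs.append({"question": columns[0], "answer": columns[1]})
    return pairs
```

This mirrors the validation the preprocessor performs further down in this PR: more than two columns raises an error, and single-column (usually empty) lines are skipped.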
81 changes: 48 additions & 33 deletions pipelines/pipelines/nodes/file_converter/docx.py
@@ -108,6 +108,8 @@ def convert(
         # This part will parse the docs files with images, the text and the following images will be added as an document
         for i in range(len(file.paragraphs)):
             paragraph = file.paragraphs[i]
+            # Extracting images from the paragraph
+            image_list = self.get_image_list(file, paragraph)
             # Extracting text from the paragraph
             # If there is text, Adding the text to text_dict
             if (paragraph.text != ""):
@@ -116,44 +118,57 @@ def convert(
                     text_dict = {'text': [text], 'images': []}
                 else:
                     text_dict['text'].append(text)
-            # Extracting images from the paragraph
-            image_list = self.get_image_list(file, paragraph)
-            # If there are not text and images, adding text_dict to documents
-            if (image_list is None and bool(text_dict)):
-                raw_text = ''.join(text_dict['text'])
-                # If the extracted text is "", skip it
-                if (raw_text == ''):
-                    continue
-                meta_data = {}
-                if (meta is not None and 'name' in meta):
-                    meta_data['name'] = meta['name']
-                meta_data['images'] = text_dict['images']
-                document = {
-                    "content": raw_text,
-                    "content_type": "text",
-                    "meta": meta_data
-                }
-                documents.append(document)
-
-                text = paragraph.text
-                text_dict = {'text': [text], 'images': []}
-            elif (image_list is not None):
-                for i, image in enumerate(image_list):
-                    if image:
-                        # File extension & file content
-                        ext, blob = image.ext, image.blob
-                        # Using md5 to generate image name and save image into desc_path
-                        md5hash = hashlib.md5(blob)
-                        md5_name = md5hash.hexdigest()
-                        image_name = '{}_{}.{}'.format(md5_name, i, ext)
-                        image_path = os.path.join(self.desc_path, image_name)
-                        Image.open(BytesIO(blob)).save(image_path)
-                        # Adding image_name into the text_dict as the image for the text
-                        text_dict['images'].append(image_name)
-            else:
-                continue
+                if (image_list is not None):
+                    image_names = self.save_images(image_list)
+                    text_dict['images'] += image_names
+            else:
+                # If there are not text and images, adding text_dict to documents
+                if (image_list is None and bool(text_dict)):
+                    raw_text = ''.join(text_dict['text'])
+                    # If the extracted text is "", skip it
+                    if (raw_text == ''):
+                        continue
+                    meta_data = {}
+                    if (meta is not None and 'name' in meta):
+                        meta_data['name'] = meta['name']
+                    meta_data['images'] = text_dict['images']
+                    document = {
+                        "content": raw_text,
+                        "content_type": "text",
+                        "meta": meta_data
+                    }
+                    documents.append(document)
+
+                    text = paragraph.text
+                    text_dict = {'text': [text], 'images': []}
+                elif (image_list is not None):
+                    image_names = self.save_images(image_list)
+                    text_dict['images'] += image_names
+                else:
+                    continue
         return documents
 
+    def save_images(self, image_list):
+        """
+        Save the parsed image into desc_path
+        :param image_list: image files from the docx file
+        """
+        image_names = []
+        for i, image in enumerate(image_list):
+            if image:
+                # File extension & file content
+                ext, blob = image.ext, image.blob
+                # Using md5 to generate image name and save image into desc_path
+                md5hash = hashlib.md5(blob)
+                md5_name = md5hash.hexdigest()
+                image_name = '{}_{}.{}'.format(md5_name, i, ext)
+                image_path = os.path.join(self.desc_path, image_name)
+                Image.open(BytesIO(blob)).save(image_path)
+                # Adding image_name into the text_dict as the image for the text
+                image_names.append(image_name)
+
+        return image_names
+
     def get_image_list(self, document: Document, paragraph: Paragraph):
         """
         Extract images from paragraph and document object.
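The refactor above extracts the duplicated image-saving loop into a `save_images` helper that both branches now call. A small sketch of the content-addressed naming scheme it uses (an MD5 digest of the image bytes, so identical images map to identical file names):

```python
import hashlib

# Sketch of the naming used in save_images: '<md5-of-bytes>_<index>.<ext>'.
def image_file_name(blob: bytes, index: int, ext: str) -> str:
    md5_name = hashlib.md5(blob).hexdigest()
    return '{}_{}.{}'.format(md5_name, index, ext)

print(image_file_name(b'\x89PNG fake bytes', 0, 'png'))
# -> '<32 hex chars>_0.png'
```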
57 changes: 44 additions & 13 deletions pipelines/pipelines/nodes/preprocessor/preprocessor.py
@@ -60,6 +60,7 @@ def __init__(
         split_by: str = "word",
         split_length: int = 200,
         split_overlap: int = 0,
+        split_answers: bool = False,
         split_respect_sentence_boundary: bool = True,
         language: str = "en",
     ):
@@ -93,6 +94,7 @@ def __init__(
             split_by=split_by,
             split_length=split_length,
             split_overlap=split_overlap,
+            split_answers=split_answers,
             split_respect_sentence_boundary=split_respect_sentence_boundary,
         )
 
@@ -110,6 +112,7 @@ def __init__(
         self.split_respect_sentence_boundary = split_respect_sentence_boundary
         self.language = iso639_to_nltk.get(language, language)
         self.print_log: Set[str] = set()
+        self.split_answers = split_answers
 
     def process(
         self,
@@ -160,6 +163,7 @@ def _process_single(
         split_by: Optional[str] = None,
         split_length: Optional[int] = None,
         split_overlap: Optional[int] = None,
+        split_answers: Optional[int] = None,
         split_respect_sentence_boundary: Optional[bool] = None,
     ) -> List[dict]:
 
@@ -177,6 +181,8 @@ def _process_single(
             split_overlap = self.split_overlap
         if split_respect_sentence_boundary is None:
             split_respect_sentence_boundary = self.split_respect_sentence_boundary
+        if (split_answers is None):
+            split_answers = self.split_answers
 
         cleaned_document = self.clean(
             document=document,
@@ -189,6 +195,7 @@ def _process_single(
             split_by=split_by,
             split_length=split_length,
             split_overlap=split_overlap,
+            split_answers=split_answers,
             split_respect_sentence_boundary=split_respect_sentence_boundary,
         )
         return split_documents
@@ -240,6 +247,7 @@ def split(
         split_by: str,
         split_length: int,
         split_overlap: int,
+        split_answers: bool,
         split_respect_sentence_boundary: bool,
     ) -> List[dict]:
         """Perform document splitting on a single document. This method can split on different units, at different lengths,
@@ -303,7 +311,10 @@ def split(
                     text_splits.append(txt)
         else:
             # create individual "elements" of passage, sentence, or word
-            if split_by == "passage":
+            # Faq text need to split text by '\n' of a passage
+            if split_answers and split_by == "passage":
+                text_splits = text.split("\n")
+            elif split_by == "passage":
                 elements = text.split("\n\n")
             elif split_by == "sentence":
                 elements = nltk.tokenize.sent_tokenize(text,
@@ -316,25 +327,45 @@ def split(
                                                        )
 
             # concatenate individual elements based on split_length & split_stride
-            if split_overlap:
-                segments = windowed(elements,
-                                    n=split_length,
-                                    step=split_length - split_overlap)
-            else:
-                segments = windowed(elements, n=split_length, step=split_length)
-            text_splits = []
-            for seg in segments:
-                txt = " ".join([t for t in seg if t is not None])
-                if len(txt) > 0:
-                    text_splits.append(txt)
+            # FAQ text process don't need split text into fix lengths
+            if (not split_answers):
+                if split_overlap:
+                    segments = windowed(elements,
+                                        n=split_length,
+                                        step=split_length - split_overlap)
+                else:
+                    segments = windowed(elements,
+                                        n=split_length,
+                                        step=split_length)
+                text_splits = []
+                for seg in segments:
+                    txt = " ".join([t for t in seg if t is not None])
+                    if len(txt) > 0:
+                        text_splits.append(txt)
         # create new document dicts for each text split
         documents = []
         for i, txt in enumerate(text_splits):
             doc = deepcopy(document)
             doc["content"] = txt
 
             if "meta" not in doc.keys() or doc["meta"] is None:
                 doc["meta"] = {}
+            if (split_answers):
+                text_arr = doc["content"].split('\t')
+                if (len(text_arr) > 2):
+                    raise Exception(
+                        "Each line text must be two columns and separated by \t"
+                    )
+                # Maybe empty lines
+                if (len(text_arr) == 1):
+                    logger.info(
+                        'Some lines in your text cannot parse into question and text, maybe empty lines'
+                    )
+                    continue
+                else:
+                    query, answer = text_arr
+                    doc["content"] = query
+                    doc["meta"]["answer"] = answer
             doc["meta"]["_split_id"] = i
             documents.append(doc)

Review comment (Collaborator), on the `segments = windowed(...)` lines: This if/else looks unnecessary; it can be decided directly from the split_overlap value.
Reply (Contributor Author): Fixed.

Review comment (Collaborator), on `text_arr = doc["content"].split('\t')`: What exactly does split_answers mean?
Reply (Contributor Author): FAQ question answering needs to read question-answer pairs from the file; split_answers means splitting each QA pair on \t.
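On the first review exchange above: the reviewer's point is that the `if split_overlap` branch is redundant, because `step = split_length - split_overlap` already reduces to `step = split_length` when `split_overlap` is 0. A minimal sketch:

```python
from more_itertools import windowed

elements = ["s1", "s2", "s3", "s4"]
split_length, split_overlap = 2, 0

# step == split_length when split_overlap == 0, so one call covers both cases
segments = windowed(elements, n=split_length, step=split_length - split_overlap)
print([" ".join(t for t in seg if t is not None) for seg in segments])
# -> ['s1 s2', 's3 s4']
```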

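And on the second exchange: with `split_answers` enabled, each passage line is treated as a tab-separated question/answer pair; the question becomes the document content used for retrieval, and the answer moves into the metadata. A minimal sketch of the resulting document dict:

```python
# Sketch of the split_answers post-processing on one line of FAQ data.
line = "我想买保险,可以买哪些?\t人身保障的保险,主要可以分为四大险种。"
query, answer = line.split("\t")
doc = {"content": query, "meta": {"answer": answer, "_split_id": 0}}
print(doc["content"])         # the question, indexed for retrieval
print(doc["meta"]["answer"])  # the answer, returned alongside the hit
```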
5 changes: 3 additions & 2 deletions pipelines/rest_api/pipeline/dense_faq.yaml
@@ -32,8 +32,9 @@ components:    # define all the building-blocks for Pipeline
   - name: Preprocessor
     type: PreProcessor
     params:
-      split_by: word
-      split_length: 1000
+      split_by: passage
+      split_respect_sentence_boundary: False
+      split_answers: True
   - name: FileTypeClassifier
     type: FileTypeClassifier

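For reference, the new YAML parameters correspond to constructing the node directly in Python roughly like this (a sketch; the import path is assumed from the file layout above and may differ):

```python
# Sketch: the dense_faq.yaml Preprocessor params expressed in Python.
from pipelines.nodes.preprocessor.preprocessor import PreProcessor

preprocessor = PreProcessor(
    split_by="passage",                     # FAQ files are split per passage
    split_respect_sentence_boundary=False,  # lines are QA pairs, not prose
    split_answers=True,                     # parse "question\tanswer" pairs
)
```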
27 changes: 21 additions & 6 deletions pipelines/ui/webapp_faq.py
@@ -56,6 +56,20 @@ def on_change_text():
     st.session_state.raw_json = None
 
 
+def upload():
+    data_files = st.session_state.upload_files['files']
+    for data_file in data_files:
+        # Upload file
+        if data_file and data_file.name not in st.session_state.upload_files[
+                'uploaded_files']:
+            raw_json = upload_doc(data_file)
+            st.session_state.upload_files['uploaded_files'].append(
+                data_file.name)
+    # Save the uploaded files
+    st.session_state.upload_files['uploaded_files'] = list(
+        set(st.session_state.upload_files['uploaded_files']))
+
+
 def main():
 
     st.set_page_config(
@@ -68,6 +82,7 @@ def main():
     set_state_if_absent("results", None)
     set_state_if_absent("raw_json", None)
     set_state_if_absent("random_question_requested", False)
+    set_state_if_absent("upload_files", {'uploaded_files': [], 'files': []})
 
     # Small callback to reset the interface in case the text of the question changes
     def reset_results(*args):
@@ -101,13 +116,13 @@ def reset_results(*args):
     data_files = st.sidebar.file_uploader(
         "",
         type=["pdf", "txt", "docx", "png"],
-        help="文件上传",
+        help="选择多个文件",
         accept_multiple_files=True)
-    for data_file in data_files:
-        # Upload file
-        if data_file:
-            raw_json = upload_doc(data_file)
-            st.sidebar.write(str(data_file.name) + "    ✅ ")
+    st.session_state.upload_files['files'] = data_files
+    st.sidebar.button("文件上传", on_click=upload)
+    for data_file in st.session_state.upload_files['uploaded_files']:
+        st.sidebar.write(str(data_file) + "    ✅ ")
 
     hs_version = ""
     try:
         hs_version = f" <small>(v{pipelines_version()})</small>"

(UI strings: the help text "选择多个文件" means "select multiple files"; the old help text and the new button label "文件上传" mean "file upload".)
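The UI change defers uploading until the user clicks the button: the file picker's selection is stashed in `st.session_state`, and the button's `on_click` callback performs the upload, so Streamlit reruns do not re-upload already-processed files. A self-contained sketch of the pattern (independent of this app's `upload_doc` helper):

```python
import streamlit as st

# Deferred-upload pattern: remember what was already uploaded across reruns.
if "upload_files" not in st.session_state:
    st.session_state.upload_files = {"uploaded_files": [], "files": []}

def upload():
    for f in st.session_state.upload_files["files"] or []:
        if f and f.name not in st.session_state.upload_files["uploaded_files"]:
            # ... send the file to the backend here ...
            st.session_state.upload_files["uploaded_files"].append(f.name)

st.session_state.upload_files["files"] = st.sidebar.file_uploader(
    "Files", accept_multiple_files=True)
st.sidebar.button("Upload", on_click=upload)
for name in st.session_state.upload_files["uploaded_files"]:
    st.sidebar.write(name + " ✅")
```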