Skip to content

Commit 3c12e55

Browse files
committed
ner + textline work
1 parent 75c2c85 commit 3c12e55

File tree

10 files changed

+363
-87
lines changed

10 files changed

+363
-87
lines changed

.dockerignore

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
data/*
2+
*.egg-info
3+
venv
4+
models
5+
*.tar.gz

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ COPY . /usr/src/qurator-mono-repo
1717
RUN pip3 --no-cache-dir install -e /usr/src/qurator-mono-repo
1818

1919
WORKDIR /usr/src/qurator-mono-repo
20-
CMD export LANG=C.UTF-8; env FLASK_APP=qurator/qurator_sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True flask run --host=0.0.0.0
20+
CMD export LANG=C.UTF-8; env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=True flask run --host=0.0.0.0

Dockerfile.cpu

+7-4
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@ RUN apt-get update && \
1111
COPY requirements.txt /tmp
1212
RUN pip3 --no-cache-dir install -r /tmp/requirements.txt
1313

14-
COPY . /usr/src/qurator-mono-repo
14+
COPY . /usr/src/qurator-sbb-ner
1515

16-
RUN pip3 --no-cache-dir install -e /usr/src/qurator-mono-repo
16+
RUN mkdir -p /usr/src/qurator-sbb-ner/konvens2019
17+
RUN mkdir -p /usr/src/qurator-sbb-ner/digisam
1718

18-
WORKDIR /usr/src/qurator-mono-repo
19-
CMD env FLASK_APP=qurator/qurator_sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=False flask run --host=0.0.0.0
19+
RUN pip3 --no-cache-dir install -e /usr/src/qurator-sbb-ner
20+
21+
WORKDIR /usr/src/qurator-sbb-ner
22+
CMD env FLASK_APP=qurator/sbb_ner/webapp/app.py env FLASK_ENV=development env USE_CUDA=False flask run --host=0.0.0.0

Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -510,3 +510,7 @@ wikipedia-evaluation: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-LFT.pkl
510510
wikipedia-evaluation2: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-SBB.pkl
511511
wikipedia-evaluation3: $(BUILD_PATH)/wikipedia-de-finetuned/eval_results-DE-CONLL-TESTA.pkl
512512

513+
###############################
514+
515+
# Bundle the listed model directories under data/konvens2019 into
# models.tar.gz (tar flags: -c create, -h follow symlinks, -z gzip, -f file).
# Paths matching '*ep[1-6]*', '*eval*', 'pytorch_model.bin' and '*.pkl' are
# left out of the archive.
#
# The target name is a command, not the file the recipe writes, so it is
# declared phony: otherwise a file or directory called "model_archive" would
# make this rule look up to date and the archive would never be rebuilt.
.PHONY: model_archive
model_archive:
	tar --exclude='*ep[1-6]*' --exclude='*eval*' --exclude='pytorch_model.bin' --exclude='*.pkl' -chzf models.tar.gz data/konvens2019/build-wd_0.03/bert-all-german-de-finetuned data/konvens2019/build-on-all-german-de-finetuned/bert-sbb-de-finetuned data/konvens2019/build-wd_0.03/bert-sbb-de-finetuned data/konvens2019/build-wd_0.03/bert-all-german-baseline

qurator/sbb_ner/webapp/app.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -317,12 +317,11 @@ def ner(model_id):
317317

318318
output = []
319319

320-
word = None
321-
last_prediction = 'O'
322-
323320
for tokens, word_predictions in prediction:
324321

322+
word = None
325323
last_prediction = 'O'
324+
output_sentence = []
326325

327326
for token, word_pred in zip(tokens, word_predictions):
328327

@@ -331,7 +330,7 @@ def ner(model_id):
331330

332331
if not token.startswith('##'):
333332
if word is not None:
334-
output.append({'word': word, 'prediction': last_prediction})
333+
output_sentence.append({'word': word, 'prediction': last_prediction})
335334

336335
word = ''
337336

@@ -342,8 +341,10 @@ def ner(model_id):
342341
if word_pred != 'X':
343342
last_prediction = word_pred
344343

345-
if word is not None and len(word) > 0:
346-
output.append({'word': word, 'prediction': last_prediction})
344+
if word is not None and len(word) > 0:
345+
output_sentence.append({'word': word, 'prediction': last_prediction})
346+
347+
output.append(output_sentence)
347348

348349
return jsonify(output)
349350

qurator/sbb_ner/webapp/static/index.html

+9-7
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
<link rel="stylesheet" href="css/bootstrap.min.css"
1010
integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
1111

12-
<title>NER auf den digitalen Sammlungen</title>
12+
<title>NER - Demo </title>
1313
<script src="js/jquery-3.4.1.js"></script>
1414
</head>
1515
<body>
@@ -22,7 +22,7 @@
2222
<div class="col-10">
2323
<div class="row">
2424
<div class="col-9 text-center">
25-
<h1>NER auf den digitalen Sammlungen</h1>
25+
<h1>NER - Demo</h1>
2626
</div>
2727
<div class="col">
2828
</div>
@@ -35,7 +35,6 @@ <h1>NER auf den digitalen Sammlungen</h1>
3535
<div class="form-group row ml-2">
3636
<label for="task" class="col-sm-2 col-form-label">Task:</label>
3737
<select id="task" class="selectpicker col-md-auto" onchange="task_select()">
38-
<option value="1">OCR-Text aus ALTO Datei</option>
3938
<option value="2">Wort- und Satztokenisierung</option>
4039
<option value="3" selected>Named Entity Recognition</option>
4140
<option value="4">BERT Tokens</option>
@@ -48,10 +47,13 @@ <h1>NER auf den digitalen Sammlungen</h1>
4847
</div>
4948

5049
<div class="form-group row ml-2">
51-
<label for="ppn" class="col-sm-2 col-form-label">PPN:</label>
52-
<input id="ppn" list="ppnexamples" class="col-sm-8" type="text"/>
53-
<datalist id="ppnexamples">
54-
</datalist>
50+
<label for="inputtext" class="col-sm-2 col-form-label">Input text:</label>
51+
<!-- <input id="inputtext" class="col-sm-8" type="text" rows=10/> -->
52+
<textarea id="inputtext" class=" col-sm-8 form-control" rows="3" required></textarea>
53+
</div>
54+
55+
<div class="form-group row ml-2">
56+
<div class="col-sm-2"></div>
5557
<button class="btn btn-primary" type="submit">Go</button>
5658
</div>
5759
</form>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
2+
// Page start-up: wire the form, fill the model and example lists from the
// server, then bring the model selector in line with the preselected task.
$(document).ready(function () {

    // Submitting the form must not reload the page; run the selected task
    // through load_ppn() instead.
    $('#nerform').submit(
        function (e) {
            e.preventDefault();
            load_ppn();
        }
    );

    // Fill the #model <select> with one <option> per entry returned by
    // GET /models; entries whose `default` field is truthy are preselected.
    $.get("/models")
        .done(
            function (data) {
                var tmp = "";
                $.each(data,
                    function (index, item) {

                        // Declared locally: the bare assignment used before
                        // leaked `selected` into the global scope (and is a
                        // ReferenceError in strict mode).
                        var selected = item.default ? "selected" : "";

                        tmp += '<option value="' + item.id + '" ' + selected + ' >' + item.name + '</option>';
                    });
                $('#model').html(tmp);
            }
        );

    // Fill the #ppnexamples list with one <option> per entry returned by
    // GET /ppnexamples.
    $.get("/ppnexamples")
        .done(
            function (data) {
                var tmp = "";
                $.each(data,
                    function (index, item) {

                        tmp += '<option value="' + item.ppn + '">' + item.name + '</option>';
                    });
                $('#ppnexamples').html(tmp);
            }
        );

    task_select();
});
44+
45+
/**
 * React to a change of the #task selector.
 *
 * The #model_select element is hidden while the selected task value is
 * below 3 and shown otherwise; any previously rendered result and legend
 * are cleared.
 */
function task_select() {

    var chosen_task = $('#task').val();

    // toggle(true) behaves like show(), toggle(false) like hide().
    var needs_model = !(chosen_task < 3);
    $('#model_select').toggle(needs_model);

    // Drop output that belongs to the previously selected task.
    $("#legende").html("");
    $("#resultregion").html("");
}
59+
60+
61+
/**
 * Run the task selected in #task for the value typed into #ppn and render
 * the server response into #resultregion.
 *
 * Task value -> requested endpoint:
 *   1 -> /digisam-fulltext/<ppn>
 *   2 -> /digisam-tokenized/<ppn>
 *   3 -> /digisam-ner/<model_id>/<ppn>             (also shows the legend)
 *   4 -> /digisam-ner-bert-tokens/<model_id>/<ppn>
 * Any other task value only clears the legend.
 *
 * A spinner is shown while the request is pending; on failure the result
 * region shows 'Failed.'.
 *
 * NOTE(review): confirm that the page loading this script still provides a
 * #ppn input and a task option with value 1.
 */
function load_ppn() {

    // The value is typed by the user and ends up in the URL path, so it is
    // percent-encoded to keep characters such as '/', '?' or '#' from
    // changing which URL is requested.
    var ppn = encodeURIComponent($('#ppn').val());

    var text_region_html =
        `<div class="card">
            <div class="card-header">
                Ergebnis:
            </div>
            <div class="card-block">
                <div id="textregion" style="overflow-y:scroll;height: 65vh;"></div>
            </div>
        </div>`;

    var legende_html =
         `<div class="card">
            <div class="card-header">
                Legende:
                <div class="ml-2" >[<font color="red">Person</font>]</div>
                <div class="ml-2" >[<font color="green">Ort</font>]</div>
                <div class="ml-2" >[<font color="blue">Organisation</font>]</div>
                <div class="ml-2" >[keine Named Entity]</div>
            </div>
        </div>`;

    var spinner_html =
        `<div class="d-flex justify-content-center">
            <div class="spinner-border align-center" role="status">
                <span class="sr-only">Loading...</span>
            </div>
        </div>`;

    // Shared by all tasks: show the spinner, GET `url`, then either render
    // the `text` field of the response (plus the legend when `with_legend`
    // is true) or report the failure.
    function fetch_into_result(url, with_legend) {
        $("#resultregion").html(spinner_html);

        $.get(url)
            .done(function (data) {
                $("#resultregion").html(text_region_html);
                $("#textregion").html(data.text);

                if (with_legend) {
                    $("#legende").html(legende_html);
                }
            })
            .fail(function () {
                console.log('Failed.');
                $("#resultregion").html('Failed.');
            });
    }

    $("#legende").html("");

    var task = $('#task').val();
    var model_id = $('#model').val();

    console.log("Task: " + task);

    // `task` is the string value of the <select>; loose equality keeps the
    // comparison against the numeric task ids working.
    if (task == 1) {
        fetch_into_result("/digisam-fulltext/" + ppn, false);
    }
    else if (task == 2) {
        fetch_into_result("/digisam-tokenized/" + ppn, false);
    }
    else if (task == 3) {
        fetch_into_result("/digisam-ner/" + model_id + "/" + ppn, true);
    }
    else if (task == 4) {
        fetch_into_result("/digisam-ner-bert-tokens/" + model_id + "/" + ppn, false);
    }
}

0 commit comments

Comments
 (0)