This repository was archived by the owner on Dec 21, 2018. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathCountPattern.js
116 lines (100 loc) · 3.23 KB
/
CountPattern.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import {ingest, Transform} from 'vega-dataflow';
import {inherits} from 'vega-util';
/**
* Count regexp-defined pattern occurrences in a text field.
* @constructor
* @param {object} params - The parameters for this operator.
* @param {function(object): *} params.field - An accessor for the text field.
* @param {string} [params.pattern] - RegExp string defining the text pattern.
* @param {string} [params.case] - One of 'lower', 'upper' or null (mixed) case.
* @param {string} [params.stopwords] - RegExp string of words to ignore.
*/
export default function CountPattern(params) {
Transform.call(this, null, params);
}
CountPattern.Definition = {
"type": "CountPattern",
"metadata": {"generates": true, "changes": true},
"params": [
{ "name": "field", "type": "field", "required": true },
{ "name": "case", "type": "enum", "values": ["upper", "lower", "mixed"], "default": "mixed" },
{ "name": "pattern", "type": "string", "default": "[\\w\"]+" },
{ "name": "stopwords", "type": "string", "default": "" },
{ "name": "as", "type": "string", "array": true, "length": 2, "default": ["text", "count"] }
]
};
function tokenize(text, tcase, match) {
switch (tcase) {
case 'upper': text = text.toUpperCase(); break;
case 'lower': text = text.toLowerCase(); break;
}
return text.match(match);
}
var prototype = inherits(CountPattern, Transform);
prototype.transform = function(_, pulse) {
function process(update) {
return function(tuple) {
var tokens = tokenize(get(tuple), _.case, match) || [], t;
for (var i=0, n=tokens.length; i<n; ++i) {
if (!stop.test(t = tokens[i])) update(t);
}
};
}
var init = this._parameterCheck(_, pulse),
counts = this._counts,
match = this._match,
stop = this._stop,
get = _.field,
as = _.as || ['text', 'count'],
add = process(function(t) { counts[t] = 1 + (counts[t] || 0); }),
rem = process(function(t) { counts[t] -= 1; });
if (init) {
pulse.visit(pulse.SOURCE, add);
} else {
pulse.visit(pulse.ADD, add);
pulse.visit(pulse.REM, rem);
}
return this._finish(pulse, as); // generate output tuples
};
prototype._parameterCheck = function(_, pulse) {
var init = false;
if (_.modified('stopwords') || !this._stop) {
this._stop = new RegExp('^' + (_.stopwords || '') + '$', 'i');
init = true;
}
if (_.modified('pattern') || !this._match) {
this._match = new RegExp((_.pattern || '[\\w\']+'), 'g');
init = true;
}
if (_.modified('field') || pulse.modified(_.field.fields)) {
init = true;
}
if (init) this._counts = {};
return init;
};
prototype._finish = function(pulse, as) {
var counts = this._counts,
tuples = this._tuples || (this._tuples = {}),
text = as[0],
count = as[1],
out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS),
w, t, c;
for (w in counts) {
t = tuples[w];
c = counts[w] || 0;
if (!t && c) {
tuples[w] = (t = ingest({}));
t[text] = w;
t[count] = c;
out.add.push(t);
} else if (c === 0) {
if (t) out.rem.push(t);
counts[w] = null;
tuples[w] = null;
} else if (t[count] !== c) {
t[count] = c;
out.mod.push(t);
}
}
return out.modifies(as);
};