Skip to content

Commit

Permalink
unique texts
Browse files Browse the repository at this point in the history
  • Loading branch information
mgraffg committed Jan 12, 2024
1 parent 98c599c commit 465ee10
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 3 deletions.
2 changes: 1 addition & 1 deletion EvoMSA/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = '2.0.3'
__version__ = '2.0.4'

try:
from EvoMSA.text_repr import BoW, TextRepresentations, StackGeneralization, DenseBoW
Expand Down
21 changes: 20 additions & 1 deletion EvoMSA/tests/test_text_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,4 +612,23 @@ def test_DenseBoW_tailored():
n_jobs=-1)
assert len(dense.names) == 0
dense.text_representations_extend('IberLEF2023_DAVINCIS_task1')
assert len(dense.names)
assert len(dense.names)


def test_unique():
    """Check that unique drops repeated texts, as indexes and as texts"""
    from EvoMSA.text_repr import unique

    texts = ['hola buenos dias', 'peticion', 'la vida', 'peticion',
             'adios colegas', 'la vida', 'comiendo en el salón', 'adios colegas',
             'nuevo', 'la vida', 'nuevo', 'hola buenos dias']
    # Positions of the first occurrence of every distinct text.
    expected = np.array([0, 1, 2, 4, 6, 8])

    # A batch smaller than the input exercises both the within-batch
    # and the across-batch filtering.
    as_index = unique(texts, lang='es', return_index=True, batch_size=4)
    assert np.all(as_index == expected)

    as_texts = unique(texts, lang='es', return_index=False, batch_size=4)
    assert isinstance(as_texts, list)
    assert len(as_texts) == expected.shape[0]

    # Make the last tweet a copy of the first one; the last surviving
    # index is then expected to be 998.
    tweets = list(tweet_iterator(TWEETS))
    tweets[-1] = tweets[0]
    kept = unique(tweets, lang='es', batch_size=11)
    assert kept[-1] == 998

59 changes: 58 additions & 1 deletion EvoMSA/text_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1058,4 +1058,61 @@ def __sklearn_clone__(self):
for x in self.decision_function_models]
params['transform_models'] = [clone(x)
for x in self.transform_models]
return klass(**params)
return klass(**params)


def unique(D: List[Union[dict, list, str]],
           lang: str='es',
           return_index: bool=True,
           alpha: float=0.95,
           batch_size: int=1024):
    """Compute the unique elements in a set using :py:class:`~EvoMSA.text_repr.BoW`

    The texts are transformed with ``BoW(lang=lang)``; two texts are
    considered duplicates when the dot product of their representations
    is greater than or equal to ``alpha``. The elements are processed in
    batches of ``batch_size``: duplicates are first removed inside the
    batch, and then the survivors are used to discard their duplicates
    from the elements that have not been processed yet.

    :param D: Texts; when it is a list of dictionaries, the text is on the key :py:attr:`BoW.key`
    :type D: List of texts or dictionaries.
    :param lang: Language.
    :type lang: str
    :param return_index: Return the indexes instead of the elements.
    :type return_index: bool
    :param alpha: Similarity threshold; a value greater than or equal to it marks two texts as duplicates.
    :type alpha: float
    :param batch_size: Number of elements compared against each other at once.
    :type batch_size: int
    :return: Array with the positions in ``D`` of the elements kept when
             ``return_index`` is True; otherwise, the list of kept elements of ``D``.
    """

    def unique_self(elementos):
        # Remove the duplicates found inside the batch.
        batch = X[elementos]
        # The sparse ``dot`` method is used because ``np.dot`` is not
        # aware of sparse containers.
        sim = (batch.dot(batch.T) >= alpha).tocsr()
        # The leader test below reads the first stored column of each
        # row, hence the column indices must be sorted.
        sim.sort_indices()
        indptr = sim.indptr
        indices = sim.indices
        # Rows similar to at least another element.
        rows = np.flatnonzero(np.diff(indptr) > 1)
        # A row is a leader when its first similar element is itself,
        # i.e., no previous element in the batch is similar to it.
        leaders = rows[indices[indptr[rows]] == rows]
        keep = np.ones(len(elementos), dtype=bool)
        for row in leaders:
            # Drop every element similar to the leader but the leader.
            keep[indices[indptr[row] + 1:indptr[row + 1]]] = False
        return elementos[keep]

    def unique_rest(frst, rest):
        # Keep the pending elements that are not similar to any of the
        # elements already accepted in the current batch.
        if rest.shape[0] == 0:
            # Last batch: nothing to filter, avoid reducing an empty matrix.
            return rest
        sim = X[rest].dot(X[frst].T).max(axis=1).toarray()
        return rest[sim.flatten() < alpha]

    X = BoW(lang=lang).transform(D)
    init = 0
    pool = np.arange(len(D))
    while init < pool.shape[0]:
        # pool is [accepted | current batch | pending].
        past = pool[:init]
        elementos = pool[init:init + batch_size]
        rest = pool[init + batch_size:]
        frst = unique_self(elementos)
        rest = unique_rest(frst, rest)
        # The first element of a batch is never removed, so frst is
        # not empty and the loop always advances.
        init = init + frst.shape[0]
        pool = np.r_[past, frst, rest]
    if return_index:
        return pool
    return [D[x] for x in pool]

0 comments on commit 465ee10

Please sign in to comment.