From 465ee1091798a96ec741e255d5a45b56674d396c Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Thu, 11 Jan 2024 21:37:05 -0600 Subject: [PATCH] unique texts --- EvoMSA/__init__.py | 2 +- EvoMSA/tests/test_text_repr.py | 21 +++++++++++- EvoMSA/text_repr.py | 59 +++++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/EvoMSA/__init__.py b/EvoMSA/__init__.py index f50ff98..39cf4e9 100644 --- a/EvoMSA/__init__.py +++ b/EvoMSA/__init__.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '2.0.3' +__version__ = '2.0.4' try: from EvoMSA.text_repr import BoW, TextRepresentations, StackGeneralization, DenseBoW diff --git a/EvoMSA/tests/test_text_repr.py b/EvoMSA/tests/test_text_repr.py index d06a82c..00f0c2c 100644 --- a/EvoMSA/tests/test_text_repr.py +++ b/EvoMSA/tests/test_text_repr.py @@ -612,4 +612,23 @@ def test_DenseBoW_tailored(): n_jobs=-1) assert len(dense.names) == 0 dense.text_representations_extend('IberLEF2023_DAVINCIS_task1') - assert len(dense.names) \ No newline at end of file + assert len(dense.names) + + +def test_unique(): + """Test that unique keeps only the first occurrence of each repeated text, returning either the indexes or the texts""" + from EvoMSA.text_repr import unique + + D = ['hola buenos dias', 'peticion', 'la vida', 'peticion', + 'adios colegas', 'la vida', 'comiendo en el salón', 'adios colegas', + 'nuevo', 'la vida', 'nuevo', 'hola buenos dias'] + actual = np.array([0, 1, 2, 4, 6, 8]) + index = unique(D, lang='es', return_index=True, batch_size=4) + assert np.all(index == actual) + index = unique(D, lang='es', return_index=False, batch_size=4) + assert isinstance(index, list) and len(index) == actual.shape[0] + D = list(tweet_iterator(TWEETS)) + D[-1] = D[0] + index = unique(D, lang='es', batch_size=11) + assert index[-1] == 998 + diff --git a/EvoMSA/text_repr.py b/EvoMSA/text_repr.py index 2443f45..a844a86 100644 --- 
def unique(D: List[Union[dict, list]],
           lang: str='es',
           return_index: bool=True,
           alpha: float=0.95,
           batch_size: int=1024):
    """Compute the unique elements in a set using :py:class:`~EvoMSA.text_repr.BoW`

    Every text is transformed with :py:class:`~EvoMSA.text_repr.BoW`; two
    texts are treated as duplicates when the dot product of their
    representations is greater than or equal to ``alpha``. The texts are
    scanned in order, and a text is kept only when it is not a duplicate of
    a text kept before it, so the first occurrence of each group survives.
    The comparison is done in batches to bound the size of the similarity
    matrix; the result does not depend on ``batch_size``.

    :param D: Texts; when it is a list of dictionaries, the text is on the key :py:attr:`BoW.key`
    :type D: List of texts or dictionaries.
    :param lang: Language.
    :type lang: str
    :param return_index: Return the indexes of the kept elements instead of the elements.
    :type return_index: bool
    :param alpha: Minimum similarity for two texts to be considered duplicates.
    :type alpha: float
    :param batch_size: Number of candidates de-duplicated among themselves per iteration.
    :type batch_size: int
    :return: The indexes (:py:class:`numpy.ndarray`) of the unique elements when ``return_index`` is True; otherwise, the list with the unique elements of ``D``.
    :raises ValueError: If ``batch_size`` is less than one.
    """

    if batch_size < 1:
        # A non-positive batch never advances the cursor of the main loop.
        raise ValueError('batch_size must be a positive integer')
    if len(D) == 0:
        return np.arange(0) if return_index else []

    def unique_self(elementos):
        """Drop, inside one batch, every element similar to an earlier kept one."""
        rows = X[elementos]
        # ``@`` is the supported sparse product; np.dot is not sparse-aware.
        sim = ((rows @ rows.T) >= alpha).tocsr()
        indptr = sim.indptr
        indices = sim.indices
        keep = np.ones(rows.shape[0], dtype=bool)
        # Only rows with more than one hit can have a neighbour besides
        # themselves.
        candidates = np.flatnonzero(np.diff(indptr) > 1)
        for row in candidates.tolist():
            if not keep[row]:
                # Already discarded by an earlier kept element; it must not
                # discard others, otherwise the outcome would not be greedy.
                continue
            neighbours = indices[indptr[row]:indptr[row + 1]]
            keep[neighbours[neighbours > row]] = False
        return elementos[keep]

    def unique_rest(frst, rest):
        """Drop from ``rest`` every element similar to an element of ``frst``."""
        if rest.shape[0] == 0:
            # Nothing left to compare; avoid reducing a zero-row matrix.
            return rest
        sim = (X[rest] @ X[frst].T).max(axis=1).toarray()
        mask = sim.ravel() < alpha
        return rest[mask]

    X = BoW(lang=lang).transform(D)
    init = 0
    pool = np.arange(len(D))
    # Invariant: pool[:init] holds the elements already accepted and
    # pool[init:] holds elements that are not similar to any of them.
    while init < pool.shape[0]:
        past = pool[:init]
        elementos = pool[init:init + batch_size]
        rest = pool[init + batch_size:]
        frst = unique_self(elementos)
        rest = unique_rest(frst, rest)
        init = init + frst.shape[0]
        pool = np.r_[past, frst, rest]
    if return_index:
        return pool
    return [D[x] for x in pool]