From 465ee1091798a96ec741e255d5a45b56674d396c Mon Sep 17 00:00:00 2001
From: Mario Graff <mgraffg@ieee.org>
Date: Thu, 11 Jan 2024 21:37:05 -0600
Subject: [PATCH] unique texts

---
 EvoMSA/__init__.py             |  2 +-
 EvoMSA/tests/test_text_repr.py | 21 +++++++++++-
 EvoMSA/text_repr.py            | 59 +++++++++++++++++++++++++++++++++-
 3 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/EvoMSA/__init__.py b/EvoMSA/__init__.py
index f50ff98..39cf4e9 100644
--- a/EvoMSA/__init__.py
+++ b/EvoMSA/__init__.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = '2.0.3'
+__version__ = '2.0.4'
 
 try:
     from EvoMSA.text_repr import BoW, TextRepresentations, StackGeneralization, DenseBoW
diff --git a/EvoMSA/tests/test_text_repr.py b/EvoMSA/tests/test_text_repr.py
index d06a82c..00f0c2c 100644
--- a/EvoMSA/tests/test_text_repr.py
+++ b/EvoMSA/tests/test_text_repr.py
@@ -612,4 +612,23 @@ def test_DenseBoW_tailored():
                      n_jobs=-1)
     assert len(dense.names) == 0
     dense.text_representations_extend('IberLEF2023_DAVINCIS_task1')
-    assert len(dense.names)
\ No newline at end of file
+    assert len(dense.names)
+
+
+def test_unique():
+    """unique() keeps only the first occurrence of each repeated text."""
+    from EvoMSA.text_repr import unique
+
+    texts = ['hola buenos dias', 'peticion', 'la vida', 'peticion',
+             'adios colegas', 'la vida', 'comiendo en el salón',
+             'adios colegas', 'nuevo', 'la vida', 'nuevo', 'hola buenos dias']
+    expected = np.array([0, 1, 2, 4, 6, 8])
+    kept = unique(texts, lang='es', return_index=True, batch_size=4)
+    assert np.all(kept == expected)
+    kept = unique(texts, lang='es', return_index=False, batch_size=4)
+    assert isinstance(kept, list) and len(kept) == expected.shape[0]
+    tweets = list(tweet_iterator(TWEETS))
+    tweets[-1] = tweets[0]  # make the final tweet a duplicate of the first
+    kept = unique(tweets, lang='es', batch_size=11)
+    assert kept[-1] == 998
+
diff --git a/EvoMSA/text_repr.py b/EvoMSA/text_repr.py
index 2443f45..a844a86 100644
--- a/EvoMSA/text_repr.py
+++ b/EvoMSA/text_repr.py
@@ -1058,4 +1058,61 @@ def __sklearn_clone__(self):
                                               for x in self.decision_function_models]
         params['transform_models'] = [clone(x)
                                       for x in self.transform_models]
-        return klass(**params)
\ No newline at end of file
+        return klass(**params)
+    
+
+def unique(D: List[Union[dict, list]],
+           lang: str='es',
+           return_index: bool=True,
+           alpha: float=0.95,
+           batch_size: int=1024):
+    """Remove near-duplicate texts, keeping the first occurrence of each.
+
+    Texts are vectorized with :py:class:`~EvoMSA.text_repr.BoW`; two texts
+    are duplicates when the dot product of their vectors is >= `alpha`.
+
+    :param D: Texts; in a list of dictionaries the text is on :py:attr:`BoW.key`
+    :type D: List of texts or dictionaries.
+    :param lang: Language.
+    :type lang: str
+    :param return_index: Return the kept indexes instead of the elements.
+    :type return_index: bool
+    :param alpha: Minimum similarity for two texts to count as duplicates.
+    :type alpha: float
+    :param batch_size: Number of texts compared against each other at once.
+    :type batch_size: int
+    :return: Kept indexes into `D` (array) or the kept elements (list).
+    """
+    def unique_self(elementos):
+        # Within one batch, drop the texts that match an earlier group leader.
+        batch = X[elementos]
+        # .dot is sparse-aware; np.dot is not reliable on scipy.sparse.
+        sim = batch.dot(batch.T) >= alpha
+        indptr, indices = sim.indptr, sim.indices
+        keep = np.ones(len(elementos), dtype=bool)
+        rows = np.where(np.diff(indptr) > 1)[0]
+        # A row leads its group when its first stored column is itself.
+        for i in rows[rows == indices[indptr[rows]]]:
+            keep[indices[indptr[i] + 1:indptr[i + 1]]] = False
+        return elementos[keep]
+
+    def unique_rest(frst, rest):
+        # Drop from `rest` every text similar to a kept text in `frst`.
+        sim = X[rest].dot(X[frst].T).max(axis=1).toarray()
+        mask = sim.flatten() < alpha
+        return rest[mask]
+
+    X = BoW(lang=lang).transform(D)
+    init = 0  # pool[:init] holds the indexes already accepted as unique
+    pool = np.arange(len(D))
+    while init < pool.shape[0]:
+        past = pool[:init]
+        elementos = pool[init:init + batch_size]
+        rest = pool[init + batch_size:]
+        frst = unique_self(elementos)
+        rest = unique_rest(frst, rest)
+        init = init + frst.shape[0]
+        pool = np.r_[past, frst, rest]
+    if return_index:
+        return pool
+    return [D[x] for x in pool]
\ No newline at end of file