sequence index

ckrause · ckrause · commit b8b758020ca1 · 2023-03-04T09:55:09.000+01:00
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "editor.formatOnSave": true
+}
diff --git a/loda/oeis.py b/loda/oeis.py
@@ -1,39 +1,141 @@
 # -*- coding: utf-8 -*-
 
+import copy
+import functools
 import os.path
+import re
+
 from loda.lang import Program
 
 
@functools.total_ordering
class Sequence:
    """An integer sequence with a numeric ID, a name and a list of terms.

    Instances order by ``terms`` first and by ``id`` second, so sorting a
    list of sequences places those sharing a prefix of terms next to each
    other. Equality requires both ``id`` and ``terms`` to match; ``name`` is
    ignored. Because ``__eq__`` is defined without ``__hash__``, instances
    are unhashable.
    """

    def __init__(self, id: int, name="", terms=None):
        """Create a sequence.

        Args:
            id: Numeric identifier; ``id_str`` renders it as ``A`` followed
                by at least six zero-padded digits.
            name: Description of the sequence; empty by default.
            terms: List of integer terms. When omitted, a fresh empty list
                is created for this instance.
        """
        self.id = id
        self.name = name
        # A literal ``[]`` default would be one list shared by every
        # instance created without explicit terms.
        self.terms = [] if terms is None else terms

    def __str__(self) -> str:
        """Return ``"<A-number>: <name>"``."""
        return "{}: {}".format(self.id_str(), self.name)

    def __eq__(self, other) -> bool:
        """Compare by ID and terms; defer to Python for foreign types."""
        if not isinstance(other, Sequence):
            return NotImplemented
        return self.id == other.id and self.terms == other.terms

    def __lt__(self, other) -> bool:
        """Order lexicographically by terms, breaking ties by ID."""
        if not isinstance(other, Sequence):
            return NotImplemented
        return (self.terms, self.id) < (other.terms, other.id)

    def id_str(self) -> str:
        """Return the ID formatted like ``A000045``."""
        return "A{:06}".format(self.id)
 
 
-class SequenceCache:
-    def __init__(self, path: str, auto_fetch = False):
class SequenceMatch:
    """Mutable state of an incremental search over an index of ``size`` entries.

    A new match covers the whole index: the half-open window
    ``[start_index, end_index)`` starts as ``[0, size)``, no terms have been
    consumed yet and no IDs have been recorded as finished.
    """

    def __init__(self, size: int):
        # Candidate window into the index; the end bound is exclusive.
        self.start_index, self.end_index = 0, size
        # Number of terms consumed so far.
        self.prefix_length = 0
        # IDs tracked separately from the window; each match owns its list.
        self.finished_ids = list()
+
+
class SequenceIndex:
    """Lazily loaded, sorted index of sequences read from a directory.

    The directory must contain two text files, ``stripped`` (terms) and
    ``names`` (descriptions). Both are read on first use. Entries are kept
    sorted by the ordering of the ``Sequence`` objects so that sequences
    sharing a prefix of terms form a contiguous range, which is what
    ``refine_match`` narrows down term by term.
    """

    # "A000001 ,1,-2,3," -> ID digits and the comma-separated terms.
    # Terms may be negative, so an optional leading minus sign is accepted.
    _STRIPPED_PATTERN = re.compile(r"^A([0-9]+) ,(-?[0-9]+(?:,-?[0-9]+)*),$")
    # "A000001 Some description" -> ID digits and the description.
    _NAMES_PATTERN = re.compile(r"^A([0-9]+) (.+)$")

    def __init__(self, path: str):
        """Remember the data directory; nothing is read until first use."""
        self.__path = path
        self.__index = None   # sequences in sorted order
        self.__lookup = None  # lookup[id] -> position in the sorted index

    def __ensure_loaded(self) -> None:
        """Load the data files on first access."""
        if self.__index is None:
            self.__load()

    def size(self) -> int:
        """Return the number of sequences in the index."""
        self.__ensure_loaded()
        return len(self.__index)

    def get(self, id: int):
        """Return a copy of the sequence with the given ID.

        The returned object has its own ``terms`` list, so callers cannot
        disturb the sorted index by mutating it.

        Raises:
            IndexError: if ``id`` is not a loaded sequence ID (this includes
                zero and negative values).
        """
        self.__ensure_loaded()
        result = copy.copy(self.__get(id))
        result.terms = list(result.terms)
        return result

    def __get(self, id: int):
        # Slot 0 of the lookup table is a placeholder and negative indices
        # would wrap around, so validate the range explicitly.
        if not 1 <= id < len(self.__lookup):
            raise IndexError("unknown sequence ID: {}".format(id))
        return self.__index[self.__lookup[id]]

    def __parse_line(self, line: str, pattern):
        """Match one data line; return None for blank and ``#`` lines.

        Raises:
            ValueError: if a non-comment line does not match ``pattern``.
        """
        line = line.strip()
        if not line or line.startswith("#"):
            return None
        match = pattern.match(line)
        if not match:
            raise ValueError("parse error: {}".format(line))
        return match

    def __read_entries(self, filename: str, pattern):
        """Yield ``(id, payload)`` for each data line of a file.

        IDs must be consecutive and start at 1; anything else raises
        ``ValueError``.
        """
        expected_id = 1
        full_path = os.path.join(self.__path, filename)
        with open(full_path, encoding="utf-8") as file:
            for line in file:
                match = self.__parse_line(line, pattern)
                if match is None:
                    continue
                id = int(match.group(1))
                if id != expected_id:
                    raise ValueError("unexpected ID: {}".format(line))
                yield id, match.group(2)
                expected_id += 1

    def __load(self):
        """Read terms and names, then build the sorted index and lookup."""
        seqs = [
            Sequence(id, "", [int(t) for t in terms.split(",")])
            for id, terms in self.__read_entries(
                "stripped", self._STRIPPED_PATTERN)
        ]
        for id, name in self.__read_entries("names", self._NAMES_PATTERN):
            if id > len(seqs):
                raise ValueError(
                    "name without sequence terms: A{:06}".format(id))
            seqs[id - 1].name = name
        index = sorted(seqs)
        lookup = [0] * (len(index) + 1)
        for position, seq in enumerate(index):
            lookup[seq.id] = position
        # Publish only after everything parsed, so a failed load is retried.
        self.__lookup = lookup
        self.__index = index

    def global_match(self) -> "SequenceMatch":
        """Return a fresh match whose window covers the whole index."""
        self.__ensure_loaded()
        return SequenceMatch(len(self.__index))

    def refine_match(self, match: "SequenceMatch", term: int) -> bool:
        """Narrow ``match`` to the sequences whose next term equals ``term``.

        Sequences that end exactly at the newly consumed term are moved to
        ``match.finished_ids``; the window keeps only those with further
        terms. Relies on the index being sorted by terms, so equal prefixes
        are contiguous and a shorter list sorts before its extensions.

        Returns:
            True if the window is still non-empty afterwards, False if it is
            empty (or was already empty, in which case nothing changes).
        """
        if match.start_index >= match.end_index:
            return False
        self.__ensure_loaded()
        index = self.__index
        end = match.end_index
        arg = match.prefix_length
        match.prefix_length += 1
        pos = match.start_index
        # Skip entries whose term at this position is too small.
        while pos < end and index[pos].terms[arg] < term:
            pos += 1
        # Entries that match and have no further terms are finished.
        while (pos < end and index[pos].terms[arg] == term
               and len(index[pos].terms) == match.prefix_length):
            match.finished_ids.append(index[pos].id)
            pos += 1
        new_start = pos
        # The remaining matching entries form the new window.
        while pos < end and index[pos].terms[arg] == term:
            pos += 1
        match.start_index = new_start
        match.end_index = pos
        return new_start < pos

    def get_match_ids(self, match: "SequenceMatch") -> list[int]:
        """Return the sorted IDs in the window plus the finished IDs."""
        self.__ensure_loaded()
        window = self.__index[match.start_index:match.end_index]
        ids = [seq.id for seq in window]
        ids.extend(match.finished_ids)
        return sorted(ids)
 
 
 class ProgramCache:
@@ -47,7 +149,7 @@ def path(self, id: int) -> str:
         asm = "{}.asm".format(Sequence(id).id_str())
         return os.path.join(self.__path, dir, asm)
 
-    def get(self, id: int) -> Program:
+    def get(self, id: int):
         if id not in self.__cache:
             with open(self.path(id), "r") as file:
                 self.__cache[id] = Program(file.read())
diff --git a/sample.py b/sample.py
@@ -3,9 +3,8 @@
 from loda.oeis import ProgramCache
 from loda.runtime import Interpreter
 
-# Sample program using the LODA Pathon module
-if __name__ == "__main__":
 
+def basic_loda():
     # Initialize LODA programs cache using *.asm files from tests folder
     program_dir = os.path.join('tests', 'programs', 'oeis')
     program_cache = ProgramCache(program_dir)
@@ -19,3 +18,7 @@
     interpreter = Interpreter(program_cache=program_cache)
     sequence, _ = interpreter.eval_to_seq(program, num_terms=20)
     print(sequence)
+
+
+if __name__ == "__main__":
+    basic_loda()
diff --git a/tests/helpers.py b/tests/helpers.py
@@ -4,6 +4,7 @@
 import os.path
 from loda.oeis import Sequence
 
+OEIS_TEST_DIR = os.path.join('tests', 'oeis')
 OPERATIONS_TEST_DIR = os.path.join('tests', 'operations')
 PROGRAMS_TEST_DIR = os.path.join('tests', 'programs', 'oeis')
 
diff --git a/tests/oeis/names b/tests/oeis/names
@@ -0,0 +1,9 @@
+# OEIS Sequence Names (http://oeis.org/names.gz)
+# Last Modified: February 19 00:58 EST 2023
+# Use of this content is governed by the
+# OEIS End-User License: http://oeis.org/LICENSE
+A000001 Number of groups of order n.
+A000002 Kolakoski sequence: a(n) is length of n-th run; a(1) = 1; sequence consists just of 1's and 2's.
+A000003 Number of classes of primitive positive definite binary quadratic forms of discriminant D = -4n; or equivalently the class number of the quadratic order of discriminant D = -4n.
+A000004 The zero sequence.
+A000005 d(n) (also called tau(n) or sigma_0(n)), the number of divisors of n.
diff --git a/tests/oeis/stripped b/tests/oeis/stripped
@@ -0,0 +1,9 @@
+# OEIS Sequence Data (http://oeis.org/stripped.gz)
+# Last Modified: February 19 05:58 UTC 2023
+# Use of this content is governed by the
+# OEIS End-User License: http://oeis.org/LICENSE
+A000001 ,0,1,1,1,2,1,2,1,5,2,2,1,5,1,2,1,14,1,5,1,5,2,2,1,15,2,2,5,4,1,4,1,51,1,2,1,14,1,2,2,14,1,6,1,4,2,2,1,52,2,5,1,5,1,15,2,13,2,2,1,13,1,2,4,267,1,4,1,5,1,4,1,50,1,2,3,4,1,6,1,52,15,2,1,15,1,2,1,12,1,10,1,4,2,
+A000002 ,1,2,2,1,1,2,1,2,2,1,2,2,1,1,2,1,1,2,2,1,2,1,1,2,1,2,2,1,1,2,1,1,2,1,2,2,1,2,2,1,1,2,1,2,2,1,2,1,1,2,1,1,2,2,1,2,2,1,1,2,1,2,2,1,2,2,1,1,2,1,1,2,1,2,2,1,2,1,1,2,2,1,2,2,1,1,2,1,2,2,1,2,2,1,1,2,1,1,2,2,1,2,1,1,2,1,2,2,
+A000003 ,1,1,1,1,2,2,1,2,2,2,3,2,2,4,2,2,4,2,3,4,4,2,3,4,2,6,3,2,6,4,3,4,4,4,6,4,2,6,4,4,8,4,3,6,4,4,5,4,4,6,6,4,6,6,4,8,4,2,9,4,6,8,4,4,8,8,3,8,8,4,7,4,4,10,6,6,8,4,5,8,6,4,9,8,4,10,6,4,12,8,6,6,4,8,8,8,4,8,6,4,
+A000004 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+A000005 ,1,2,2,3,2,4,2,4,3,4,2,6,2,4,4,5,2,6,2,6,4,4,2,8,3,4,4,6,2,8,2,6,4,4,4,9,2,4,4,8,2,8,2,6,6,4,2,10,3,6,4,6,2,8,4,8,4,4,2,12,2,4,6,7,4,8,2,6,4,8,2,12,2,4,6,6,4,8,2,10,5,4,2,12,4,4,4,8,2,12,4,6,4,4,4,12,2,6,6,9,2,8,2,8,
diff --git a/tests/test_oeis.py b/tests/test_oeis.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+
+from unittest import TestCase
+
+from loda.oeis import SequenceIndex, Sequence
+from tests.helpers import OEIS_TEST_DIR
+
+NUM_SEQS = 5
+
+
class SequenceIndexTests(TestCase):
    """Checks SequenceIndex against the fixture files in OEIS_TEST_DIR."""

    def setUp(self):
        # A fresh index per test.
        self.index = SequenceIndex(OEIS_TEST_DIR)

    def test_index_size(self):
        self.assertEqual(NUM_SEQS, self.index.size())

    def test_index_get_A000004(self):
        seq: Sequence = self.index.get(4)
        self.assertEqual(4, seq.id)
        self.assertEqual("The zero sequence.", seq.name)
        self.assertEqual([0]*102, seq.terms)

    def test_index_get_A000005(self):
        expected_name = (
            "d(n) (also called tau(n) or sigma_0(n)), the number of divisors of n.")
        expected_terms = [1, 2, 2, 3, 2, 4, 2, 4, 3, 4, 2, 6, 2, 4,
                          4, 5, 2, 6, 2, 6, 4, 4, 2, 8, 3, 4, 4, 6,
                          2, 8, 2, 6, 4, 4, 4, 9, 2, 4, 4, 8, 2, 8,
                          2, 6, 6, 4, 2, 10, 3, 6, 4, 6, 2, 8, 4, 8,
                          4, 4, 2, 12, 2, 4, 6, 7, 4, 8, 2, 6, 4, 8,
                          2, 12, 2, 4, 6, 6, 4, 8, 2, 10, 5, 4, 2, 12,
                          4, 4, 4, 8, 2, 12, 4, 6, 4, 4, 4, 12, 2, 6,
                          6, 9, 2, 8, 2, 8]
        seq: Sequence = self.index.get(5)
        self.assertEqual(5, seq.id)
        self.assertEqual(expected_name, seq.name)
        self.assertEqual(expected_terms, seq.terms)

    def test_global_match(self):
        # An unrefined match contains every sequence ID, 1..NUM_SEQS.
        all_ids = list(range(1, NUM_SEQS + 1))
        match = self.index.global_match()
        self.assertEqual(all_ids, self.index.get_match_ids(match))

    def test_refine_match_A000001(self):
        steps = [
            (0, [1, 4], True),
            (1, [1], True),
            (1, [1], True),
            (47, [], False),  # test incorrect term
        ]
        self._check_refinements(steps)

    def test_refine_match_A000004(self):
        # 102 zeros in total; the final one exhausts A000004, which is then
        # still reported although no further refinement is possible.
        steps = (
            [(0, [1, 4], True)]
            + [(0, [4], True)] * 100
            + [(0, [4], False)]
        )
        self._check_refinements(steps)

    def test_refine_match_A000005(self):
        steps = [
            (1, [2, 3, 5], True),
            (2, [2, 5], True),
            (2, [2, 5], True),
            (3, [5], True),
            (2, [5], True),
            (47, [], False),  # test incorrect term
        ]
        self._check_refinements(steps)

    def _check_refinements(self, refinements):
        """Feed terms one by one, checking the result flag and matched IDs.

        Each refinement is a ``(term, expected_ids, more)`` triple.
        """
        match = self.index.global_match()
        for term, expected_ids, more in refinements:
            has_more = self.index.refine_match(match, term)
            self.assertEqual(more, has_more)
            self.assertEqual(expected_ids, self.index.get_match_ids(match))

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+{`
	`2`	`+ "editor.formatOnSave": true`
	`3`	`+}`