Skip to content

Commit a5ab58e

Browse files
authored
load b-files (#7)
1 parent 46045c2 commit a5ab58e

File tree

3 files changed

+112
-48
lines changed

3 files changed

+112
-48
lines changed

loda/oeis/sequence.py

+107-43
Original file line numberDiff line numberDiff line change
@@ -28,51 +28,115 @@ def __lt__(self, other) -> bool:
2828
def id_str(self) -> str:
2929
return "A{:06}".format(self.id)
3030

31+
@classmethod
32+
def load_oeis(cls, oeis_path: str) -> list:
33+
"""
34+
Load sequences from the `stripped` and `names` files.
35+
"""
36+
seqs = []
37+
# load sequence terms
38+
stripped = os.path.join(oeis_path, "stripped")
39+
with open(stripped) as file:
40+
pattern = re.compile("^A([0-9]+) ,([\\-0-9,]+),$")
41+
for line in file:
42+
match = cls.__parse_line(line, pattern)
43+
if not match:
44+
continue
45+
id = int(match.group(1))
46+
cls.__fill_seqs(seqs, id)
47+
seqs[id].id = id
48+
terms_str = match.group(2).split(",")
49+
seqs[id].terms = [int(t) for t in terms_str]
50+
# load sequence names
51+
names = os.path.join(oeis_path, "names")
52+
with open(names) as file:
53+
pattern = re.compile("^A([0-9]+) (.+)$")
54+
for line in file:
55+
match = cls.__parse_line(line, pattern)
56+
if not match:
57+
continue
58+
id = int(match.group(1))
59+
cls.__fill_seqs(seqs, id)
60+
name = match.group(2)
61+
seqs[id].name = name
62+
return seqs
3163

32-
def __parse_line(line: str, pattern):
33-
line = line.strip()
34-
if len(line) == 0 or line.startswith("#"):
35-
return None
36-
match = pattern.match(line)
37-
if not match:
38-
raise ValueError("parse error: {}".format(line))
39-
return match
64+
@classmethod
65+
def __parse_line(cls, line: str, pattern):
66+
line = line.strip()
67+
if len(line) == 0 or line.startswith("#"):
68+
return None
69+
match = pattern.match(line)
70+
if not match:
71+
raise ValueError("parse error: {}".format(line))
72+
return match
4073

74+
@classmethod
75+
def __fill_seqs(cls, seqs: list, id: int):
76+
current_size = len(seqs)
77+
for i in range(current_size, id+2):
78+
seqs.append(Sequence(i, "", []))
4179

42-
def __fill_seqs(seqs: list, id: int):
43-
current_size = len(seqs)
44-
for i in range(current_size, id+2):
45-
seqs.append(Sequence(i, "", []))
80+
def load_b_file(self, path: str) -> list:
81+
"""
82+
Load additional terms from a b-file.
4683
84+
Args:
85+
path: Either the path to a b-file (uncompressed `b*.txt` file) or a
86+
folder that contains the b-files in sub-directories, e.g. `b/123/b123456.txt`.
87+
"""
88+
terms = []
89+
if len(path) == 0 or os.path.isdir(path):
90+
dir = "{:03}".format(self.id//1000)
91+
txt = "b{:06}.txt".format(self.id)
92+
path = os.path.join(path, "b", dir, txt)
93+
with open(path) as b_file:
94+
expected_index = -1
95+
for line in b_file:
96+
line = line.strip()
97+
if len(line) == 0 or line[0] == "#":
98+
continue
99+
fields = line.split()
100+
if len(fields) < 2:
101+
raise ValueError("unexpected line: {}".format(line))
102+
index = int(fields[0])
103+
value = int(fields[1])
104+
if expected_index == -1:
105+
expected_index = index
106+
if index != expected_index:
107+
raise ValueError("unexpected index: {}".format(index))
108+
terms.append(value)
109+
expected_index += 1
110+
terms = self.__align(terms)
111+
if terms is None:
112+
raise ValueError("unexpected terms in b-file")
113+
if len(terms) < len(self.terms):
114+
terms = self.terms
115+
elif terms[0:len(self.terms)] != self.terms:
116+
raise ValueError("unexpected terms in b-file")
117+
return terms
47118

48-
def load(oeis_path: str) -> list:
49-
"""
50-
Load sequences from `stripped` from `names` files.
51-
"""
52-
seqs = []
53-
# load sequence terms
54-
stripped = os.path.join(oeis_path, "stripped")
55-
with open(stripped) as file:
56-
pattern = re.compile("^A([0-9]+) ,([\\-0-9,]+),$")
57-
for line in file:
58-
match = __parse_line(line, pattern)
59-
if not match:
60-
continue
61-
id = int(match.group(1))
62-
__fill_seqs(seqs, id)
63-
seqs[id].id = id
64-
terms_str = match.group(2).split(",")
65-
seqs[id].terms = [int(t) for t in terms_str]
66-
# load sequence names
67-
names = os.path.join(oeis_path, "names")
68-
with open(names) as file:
69-
pattern = re.compile("^A([0-9]+) (.+)$")
70-
for line in file:
71-
match = __parse_line(line, pattern)
72-
if not match:
73-
continue
74-
id = int(match.group(1))
75-
__fill_seqs(seqs, id)
76-
name = match.group(2)
77-
seqs[id].name = name
78-
return seqs
119+
def __align(self, terms: list, max_offset: int = 10) -> list:
120+
"""Align terms from a b-file, possibly by shifting them by an offset."""
121+
# check if they agree on prefix already
122+
min_length = min(len(self.terms), len(terms))
123+
if self.terms[0:min_length] == terms[0:min_length]:
124+
return terms
125+
# try to align them
126+
for offset in range(1, max_offset+1):
127+
if offset >= min_length:
128+
break
129+
agree_pos = True
130+
agree_neg = True
131+
for i in range(min_length):
132+
if i+offset < len(terms) and terms[i + offset] != self.terms[i]:
133+
agree_pos = False
134+
if i+offset < len(self.terms) and terms[i] != self.terms[i+offset]:
135+
agree_neg = False
136+
if agree_pos:
137+
return terms[offset:]
138+
if agree_neg:
139+
result = self.terms[0:offset]
140+
result.extend(terms)
141+
return result
142+
return None

sample.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import os.path
22

3-
from loda.oeis import ProgramCache, sequence
3+
from loda.oeis import ProgramCache, Sequence
44
from loda.runtime import Evaluator, Interpreter
55
from loda.mine import Miner
66
from loda.ml.keras.program_generation_rnn import load_model, train_model, Generator
@@ -43,12 +43,12 @@ def mine(self):
4343
print(generator())
4444

4545
existing_ids = set(self.program_cache.all_ids())
46-
seqs = sequence.load(os.path.expanduser("~/loda/oeis"))
46+
seqs = Sequence.load_oeis(os.path.expanduser("~/loda/oeis"))
4747
seqs = list(filter(lambda s:
4848
len(s.terms) >= 8 and s.id not in existing_ids, seqs))
4949
print("Loaded {} sequences".format(len(seqs)))
5050
miner = Miner(seqs, self.interpreter, generator)
51-
for i in range(500):
51+
for i in range(20):
5252
miner()
5353
if i % 10 == 0:
5454
print(generator.get_stats_info_str())

tests/test_oeis.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from unittest import TestCase
22

3-
from loda.oeis import PrefixIndex, ProgramCache, Sequence, sequence
3+
from loda.oeis import PrefixIndex, ProgramCache, Sequence
44
from tests.helpers import OEIS_TEST_DIR, PROGRAMS_TEST_DIR
55

66
NUM_SEQS = 5
@@ -10,7 +10,7 @@
1010
class PrefixIndexTests(TestCase):
1111

1212
def setUp(self):
13-
seqs = sequence.load(OEIS_TEST_DIR)
13+
seqs = Sequence.load_oeis(OEIS_TEST_DIR)
1414
self.index = PrefixIndex(seqs)
1515

1616
def test_index_size(self):

0 commit comments

Comments
 (0)