-
Notifications
You must be signed in to change notification settings - Fork 80
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add option for SBT creation to be localized #925
Changes from all commits
586395b
e79b8ab
6a49712
b19fafa
c915e6f
e344cd6
6f08bad
0f5049c
0ea6da5
f92a782
e8708fe
5210a55
285eb04
217c1ba
71a6e6e
e081de0
79e8da0
fcf7b47
4995c55
a760477
3081f4e
dc115a1
b9a69b2
dbd234d
76f7f07
d730c6b
ffd418c
feb84c7
89b6a5d
c53dd5c
4d3d0a8
52bcd56
9a32c0f
42d2b6a
42c221f
dea3b48
6fc5a1a
2a5077b
b33b5d2
d499675
4346936
55779bc
6a052d9
e6d1dde
dc1d31b
dc31e9c
8c4a71a
b255cb6
4cbffdf
338fbac
fe4a956
832861c
b756008
c1bdfa6
512ebdf
6e90a8f
1c18072
516da88
acef608
cb6a6cd
0fb5caa
7e25102
12dec1f
a8cd435
55dd7d9
0d78ccd
24b854e
733d7f6
8be87ce
5385c34
50329f9
e5ac760
e5c48d5
459bf9f
8471fc1
36e8c3b
a4ffb66
84de401
b12e050
89e520c
9c04a42
f49f385
c0c9d4c
055685a
88f2801
f12506a
7ef803d
1663bcc
b9dcfde
438c3f5
aad7425
ac1ca07
7afc419
1e6d9c9
2c7b2fb
c661af7
baa5b04
4d1995e
05fbf59
92c8aa1
975ba88
0a6fefb
09f0456
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -345,10 +345,11 @@ def index(args): | |
set_quiet(args.quiet) | ||
moltype = sourmash_args.calculate_moltype(args) | ||
|
||
if args.append: | ||
tree = load_sbt_index(args.sbt_name) | ||
if args.traverse_directory: | ||
inp_files = list(sourmash_args.traverse_find_sigs(args.signatures, | ||
args.force)) | ||
else: | ||
tree = create_sbt_index(args.bf_size, n_children=args.n_children) | ||
inp_files = list(args.signatures) | ||
|
||
if args.sparseness < 0 or args.sparseness > 1.0: | ||
error('sparseness must be in range [0.0, 1.0].') | ||
|
@@ -357,31 +358,34 @@ def index(args): | |
args.scaled = int(args.scaled) | ||
notify('downsampling signatures to scaled={}', args.scaled) | ||
|
||
inp_files = list(args.signatures) | ||
if args.from_file: | ||
more_files = sourmash_args.load_file_list_of_signatures(args.from_file) | ||
inp_files.extend(more_files) | ||
notify('loading {} files into SBT', len(inp_files)) | ||
|
||
if not inp_files: | ||
error("ERROR: no files to index!? Supply on command line or use --from-file") | ||
sys.exit(-1) | ||
tree, n = load_matching_signatures_into_tree( | ||
inp_files, args.ksize, moltype, args.scaled, args.append, args.sbt_name, | ||
return_n=True) | ||
|
||
notify('loading {} files into SBT', len(inp_files)) | ||
notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name) | ||
tree.save(args.sbt_name, sparseness=args.sparseness) | ||
|
||
progress = sourmash_args.SignatureLoadingProgress() | ||
|
||
def load_matching_signatures_into_tree(filenames, ksize, moltype, scaled=0, | ||
append=False, sbt_name=None, bf_size=1e5, | ||
n_children=2, return_n=False): | ||
if append: | ||
tree = load_sbt_index(sbt_name) | ||
else: | ||
tree = create_sbt_index(bf_size, n_children=n_children) | ||
|
||
n = 0 | ||
ksizes = set() | ||
moltypes = set() | ||
nums = set() | ||
scaleds = set() | ||
for f in inp_files: | ||
siglist = sourmash_args.load_file_as_signatures(f, | ||
ksize=args.ksize, | ||
select_moltype=moltype, | ||
traverse=args.traverse_directory, | ||
yield_all_files=args.force, | ||
progress=progress) | ||
for f in filenames: | ||
if n % 100 == 0: | ||
notify('\r...reading from {} ({} signatures so far)', f, n, end='') | ||
siglist = sig.load_signatures(f, ksize=ksize, | ||
select_moltype=moltype) | ||
|
||
# load all matching signatures in this file | ||
ss = None | ||
|
@@ -390,8 +394,8 @@ def index(args): | |
moltypes.add(sourmash_args.get_moltype(ss)) | ||
nums.add(ss.minhash.num) | ||
|
||
if args.scaled: | ||
ss.minhash = ss.minhash.downsample_scaled(args.scaled) | ||
if scaled: | ||
ss.minhash = ss.minhash.downsample_scaled(scaled) | ||
scaleds.add(ss.minhash.scaled) | ||
|
||
tree.insert(ss) | ||
|
@@ -400,32 +404,35 @@ def index(args): | |
if not ss: | ||
continue | ||
|
||
# check to make sure we aren't loading incompatible signatures | ||
if len(ksizes) > 1 or len(moltypes) > 1: | ||
error('multiple k-mer sizes or molecule types present; fail.') | ||
error('specify --dna/--protein and --ksize as necessary') | ||
error('ksizes: {}; moltypes: {}', | ||
", ".join(map(str, ksizes)), ", ".join(moltypes)) | ||
sys.exit(-1) | ||
|
||
if nums == { 0 } and len(scaleds) == 1: | ||
pass # good | ||
elif scaleds == { 0 } and len(nums) == 1: | ||
pass # also good | ||
else: | ||
error('trying to build an SBT with incompatible signatures.') | ||
error('nums = {}; scaleds = {}', repr(nums), repr(scaleds)) | ||
sys.exit(-1) | ||
|
||
check_signature_compatibilty_to_tree(ksizes, moltypes, nums, scaleds) | ||
notify('') | ||
|
||
# did we load any!? | ||
if n == 0: | ||
error('no signatures found to load into tree!? failing.') | ||
sys.exit(-1) | ||
if return_n: | ||
return tree, n | ||
else: | ||
return tree | ||
|
||
notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name) | ||
tree.save(args.sbt_name, sparseness=args.sparseness) | ||
|
||
def check_signature_compatibilty_to_tree(ksizes, moltypes, nums, scaleds): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. in `sourmash_args`, we have There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (maybe prefix it with an `_` to indicate that it's a helper function that shouldn't be used outside this module?) |
||
# check to make sure we aren't loading incompatible signatures | ||
if len(ksizes) > 1 or len(moltypes) > 1: | ||
error('multiple k-mer sizes or molecule types present; fail.') | ||
error('specify --dna/--protein and --ksize as necessary') | ||
error('ksizes: {}; moltypes: {}', | ||
", ".join(map(str, ksizes)), ", ".join(moltypes)) | ||
sys.exit(-1) | ||
if nums == {0} and len(scaleds) == 1: | ||
pass # good | ||
elif scaleds == {0} and len(nums) == 1: | ||
pass # also good | ||
else: | ||
error('trying to build an SBT with incompatible signatures.') | ||
error('nums = {}; scaleds = {}', repr(nums), repr(scaleds)) | ||
sys.exit(-1) | ||
|
||
|
||
def search(args): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this option is necessary: if `d != 2` then build the current `SBT`, otherwise always build the `LocalizedSBT`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
well, it's nice for debugging :)