Commit b03e76a

Merge pull request #3 from databio/core_utils
Core utilities
2 parents 8854edd + 3c2162b · commit b03e76a


42 files changed: +1529 −74 lines

bindings/Cargo.toml

+2 −2

@@ -1,6 +1,6 @@
 [package]
 name = "genimtools-py"
-version = "0.0.4"
+version = "0.0.7"
 edition = "2021"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -9,5 +9,5 @@ name = "genimtools"
 crate-type = ["cdylib"]

 [dependencies]
-pyo3 = "0.19.0"
 genimtools = { path = "../genimtools" }
+pyo3 = "0.20.0"

bindings/README.md

+2

@@ -14,3 +14,5 @@ import genimtools as gt

 gt.prune_universe(...)
 ```
+## Developer docs
+Write the develop docs here...

bindings/genimtools/__init__.py

+1

@@ -0,0 +1 @@
+from .genimtools import *

bindings/genimtools/__init__.pyi

+7

@@ -0,0 +1,7 @@
+PAD_CHR: str
+PAD_START: int
+PAD_END: int
+
+UNKNOWN_CHR: str
+UNKNOWN_START: int
+UNKNOWN_END: int
bindings/genimtools/tokenizers/__init__.py

+1

@@ -0,0 +1 @@
+from .genimtools.tokenizers import *
bindings/genimtools/tokenizers/__init__.pyi

+38

@@ -0,0 +1,38 @@
+from typing import List
+
+class Region:
+    chr: str
+    start: int
+    end: int
+
+class TokenizedRegion:
+    chr: str
+    start: int
+    end: int
+    id: int
+
+class TokenizedRegionSet:
+    def regions(self) -> List[TokenizedRegion]:
+        """
+        Get the list of regions in this set.
+        """
+
+    def ids(self) -> List[int]:
+        """
+        Get the list of IDs in this set.
+        """
+
+class TreeTokenizer:
+    def __init__(self, universe: str) -> None:
+        """
+        Instantiate a new TreeTokenizer. This tokenizer takes advantage of
+        interval trees to compute genomic interval overlaps efficiently.
+
+        :param universe: The universe of characters to use for tokenization.
+        """
+        pass
+
+    def tokenize(self, regions: List[Region]) -> TokenizedRegionSet:
+        """
+        Tokenize a list of regions.
+        """

bindings/genimtools/vocab/__init__.py

+1

@@ -0,0 +1 @@
+from .genimtools.vocab import *
bindings/genimtools/vocab/__init__.pyi

+13

@@ -0,0 +1,13 @@
+from typing import Optional
+
+def prune_universe(data: str, universe: str, min_count: Optional[int], output: Optional[str]) -> None:
+    """
+    Given a universe and a path to its corresponding data, "prune" the universe to remove regions that
+    are not present in the data. This is useful for reducing the size of the universe by removing unused
+    regions. It can also be used to remove regions that fall below a certain frequency threshold.
+
+    :param str data: The path to the data file.
+    :param str universe: The path to the universe file.
+    :param int min_count: The minimum number of times a region must appear in the data to be kept.
+    :param str output: The path to the output file.
+    """

bindings/pyproject.toml

−2

@@ -14,8 +14,6 @@ dynamic = ["version"]

 [tool.maturin]
 features = ["pyo3/extension-module"]
-python-source = "python" # necessary to distract maturin from the stub files
-module-name = "my_project._lib_name"

 [tool.setuptools.dynamic]
 readme = {file = "README.md", content-type = "text/markdown"}

bindings/src/consts.rs

+1

@@ -0,0 +1 @@
+pub use genimtools::common::consts::*;

bindings/src/lib.rs

+22 −18

@@ -1,34 +1,38 @@
 use pyo3::prelude::*;
 use pyo3::types::PyDict;

-use crate::functions::{prune_universe, print_uniwig};
+mod vocab;
+mod tokenizers;
+mod models;
+mod consts;

-mod functions;
+pub const VERSION: &str = env!("CARGO_PKG_VERSION");

 #[pymodule]
-fn vocab(_py: Python, m: &PyModule) -> PyResult<()> {
-    m.add_wrapped(wrap_pyfunction!(prune_universe))?;
-    Ok(())
-}
-
-#[pymodule]
-fn uniwig(_py: Python, m: &PyModule) -> PyResult<()> {
-    m.add_wrapped(wrap_pyfunction!(print_uniwig))?;
-    Ok(())
-}
+fn genimtools(py: Python, m: &PyModule) -> PyResult<()> {

-#[pymodule]
-fn genimtools(py: Python, m: &PyModule) -> PyResult<()> {
-    let vocab_module = pyo3::wrap_pymodule!(vocab);
-    let uniwig_module = pyo3::wrap_pymodule!(uniwig);
+    let vocab_module = pyo3::wrap_pymodule!(vocab::vocab);
+    let tokenize_module = pyo3::wrap_pymodule!(tokenizers::tokenizers);

     m.add_wrapped(vocab_module)?;
-    m.add_wrapped(uniwig_module)?;
+    m.add_wrapped(tokenize_module)?;

     let sys = PyModule::import(py, "sys")?;
     let sys_modules: &PyDict = sys.getattr("modules")?.downcast()?;
+
+    // set names of submodules
     sys_modules.set_item("genimtools.vocab", m.getattr("vocab")?)?;
-    sys_modules.set_item("genimtools.uniwig", m.getattr("uniwig")?)?;
+    sys_modules.set_item("genimtools.tokenizers", m.getattr("tokenizers")?)?;
+
+    // add constants
+    m.add("PAD_CHR", consts::PAD_CHR)?;
+    m.add("PAD_START", consts::PAD_START)?;
+    m.add("PAD_END", consts::PAD_END)?;
+    m.add("UNKNOWN_CHR", consts::UNKNOWN_CHR)?;
+    m.add("UNKNOWN_START", consts::UNKNOWN_START)?;
+    m.add("UNKNOWN_END", consts::UNKNOWN_END)?;
+    m.add("__version__", VERSION)?;
+
     Ok(())
 }
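From the Python side, the registration above should produce one extension module with `vocab` and `tokenizers` submodules, plus the constants and version re-exported at the top level. A rough sketch of the expected surface (not part of the commit):

```python
import genimtools

print(genimtools.__version__)  # taken from CARGO_PKG_VERSION at build time

# Because the submodules are also registered in sys.modules, both access styles work.
from genimtools import tokenizers, vocab
import genimtools.tokenizers

# Padding / unknown-token constants re-exported via consts.rs.
print(genimtools.PAD_CHR, genimtools.PAD_START, genimtools.PAD_END)
print(genimtools.UNKNOWN_CHR, genimtools.UNKNOWN_START, genimtools.UNKNOWN_END)
```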

bindings/src/models/mod.rs

+7

@@ -0,0 +1,7 @@
+mod region;
+mod region_set;
+mod universe;
+
+pub use self::region::{PyRegion, PyTokenizedRegion};
+pub use self::region_set::PyTokenizedRegionSet;
+pub use self::universe::PyUniverse;

bindings/src/models/region.rs

+112

@@ -0,0 +1,112 @@
+use std::hash::Hash;
+
+use pyo3::class::basic::CompareOp;
+use pyo3::exceptions::PyTypeError;
+use pyo3::prelude::*;
+
+use genimtools::common::models::region::Region;
+
+#[pyclass(name = "Region")]
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+pub struct PyRegion {
+    pub chr: String,
+    pub start: u32,
+    pub end: u32,
+}
+
+impl PyRegion {
+    pub fn to_region(&self) -> Region {
+        Region {
+            chr: self.chr.clone(),
+            start: self.start,
+            end: self.end,
+        }
+    }
+}
+
+#[pymethods]
+impl PyRegion {
+    #[new]
+    pub fn new(chr: String, start: u32, end: u32) -> Self {
+        PyRegion { chr, start, end }
+    }
+
+    #[getter]
+    pub fn chr(&self) -> PyResult<&str> {
+        Ok(&self.chr)
+    }
+
+    #[getter]
+    pub fn start(&self) -> PyResult<u32> {
+        Ok(self.start)
+    }
+
+    #[getter]
+    pub fn end(&self) -> PyResult<u32> {
+        Ok(self.end)
+    }
+    pub fn __repr__(&self) -> String {
+        format!("Region({}, {}, {})", self.chr, self.start, self.end)
+    }
+
+    pub fn __richcmp__(&self, other: PyRef<PyRegion>, op: CompareOp) -> PyResult<bool> {
+        match op {
+            CompareOp::Eq => {
+                Ok(self.chr == other.chr && self.start == other.start && self.end == other.end)
+            }
+            CompareOp::Ne => {
+                Ok(self.chr != other.chr || self.start != other.start || self.end != other.end)
+            }
+            _ => Err(PyTypeError::new_err("Unsupported comparison operator")),
+        }
+    }
+}
+
+#[pyclass(name = "TokenizedRegion")]
+#[derive(Clone, Debug)]
+pub struct PyTokenizedRegion {
+    pub region: PyRegion,
+    pub id: u32,
+}
+
+#[pymethods]
+impl PyTokenizedRegion {
+    #[new]
+    pub fn new(region: PyRegion, id: u32) -> Self {
+        PyTokenizedRegion {
+            region,
+            id,
+        }
+    }
+
+    #[getter]
+    pub fn chr(&self) -> PyResult<&str> {
+        Ok(&self.region.chr)
+    }
+
+    #[getter]
+    pub fn start(&self) -> PyResult<u32> {
+        Ok(self.region.start)
+    }
+
+    #[getter]
+    pub fn end(&self) -> PyResult<u32> {
+        Ok(self.region.end)
+    }
+
+    #[getter]
+    pub fn region(&self) -> PyResult<PyRegion> {
+        Ok(self.region.clone())
+    }
+    #[getter]
+    pub fn id(&self) -> PyResult<u32> {
+        Ok(self.id)
+    }
+
+    pub fn __repr__(&self) -> String {
+        format!(
+            "TokenizedRegion({}, {}, {})",
+            self.region.chr, self.region.start, self.region.end
+        )
+    }
+}
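The getters and `__richcmp__` above translate into the following Python behaviour; a small sketch, assuming `Region` is exposed under `genimtools.tokenizers` as in the stub file earlier in this commit. Only `==` and `!=` are implemented, so ordering comparisons raise `TypeError`.

```python
from genimtools.tokenizers import Region

a = Region("chr1", 100, 200)
b = Region("chr1", 100, 200)

print(a)        # Region(chr1, 100, 200), from __repr__
print(a == b)   # True: equality compares chr, start, and end
print(a != b)   # False

try:
    a < b       # CompareOp::Lt is not handled, so this raises
except TypeError as err:
    print("unsupported comparison:", err)
```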

bindings/src/models/region_set.rs

+92

@@ -0,0 +1,92 @@
+use pyo3::exceptions::PyIndexError;
+use pyo3::prelude::*;
+
+use genimtools::common::consts::{PAD_CHR, PAD_START, PAD_END};
+
+use crate::models::{PyRegion, PyTokenizedRegion};
+
+#[pyclass(name = "TokenizedRegionSet")]
+#[derive(Clone, Debug)]
+pub struct PyTokenizedRegionSet {
+    pub regions: Vec<PyRegion>,
+    pub ids: Vec<u32>,
+    curr: usize,
+}
+
+#[pymethods]
+impl PyTokenizedRegionSet {
+    #[new]
+    pub fn new(regions: Vec<PyRegion>, ids: Vec<u32>) -> Self {
+        PyTokenizedRegionSet {
+            regions,
+            ids,
+            curr: 0,
+        }
+    }
+
+    #[getter]
+    pub fn regions(&self) -> PyResult<Vec<PyRegion>> {
+        Ok(self.regions.to_owned())
+    }
+
+    #[getter]
+    pub fn ids(&self) -> PyResult<Vec<u32>> {
+        Ok(self.ids.clone())
+    }
+
+    // this is wrong: the padding token might not be in the universe
+    pub fn pad(&mut self, len: usize) {
+        let pad_region = PyRegion {
+            chr: PAD_CHR.to_string(),
+            start: PAD_START as u32,
+            end: PAD_END as u32,
+        };
+        let pad_id = self.ids[0];
+        let pad_region_set = PyTokenizedRegionSet {
+            regions: vec![pad_region; len],
+            ids: vec![pad_id; len],
+            curr: 0,
+        };
+        self.regions.extend(pad_region_set.regions);
+        self.ids.extend(pad_region_set.ids);
+    }
+
+    pub fn __repr__(&self) -> String {
+        format!("TokenizedRegionSet({} regions)", self.regions.len())
+    }
+
+    pub fn __len__(&self) -> usize {
+        self.regions.len()
+    }
+
+    fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+        slf
+    }
+
+    pub fn __next__(&mut self) -> Option<PyTokenizedRegion> {
+        if self.curr < self.regions.len() {
+            let region = self.regions[self.curr].clone();
+            let id = self.ids[self.curr];
+
+            self.curr += 1;
+            Some(PyTokenizedRegion::new(region, id))
+        } else {
+            None
+        }
+    }
+
+    pub fn __getitem__(&self, indx: isize) -> PyResult<PyTokenizedRegion> {
+        let indx = if indx < 0 {
+            self.regions.len() as isize + indx
+        } else {
+            indx
+        };
+        if indx < 0 || indx >= self.regions.len() as isize {
+            Err(PyIndexError::new_err("Index out of bounds"))
+        } else {
+            let region = self.regions[indx as usize].clone();
+            let id = self.ids[indx as usize];
+            Ok(PyTokenizedRegion::new(region, id))
+        }
+    }
+}
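With `__len__`, `__iter__`/`__next__`, and `__getitem__` (including negative indices), a `TokenizedRegionSet` behaves roughly like a read-only sequence of `TokenizedRegion` objects. A sketch under the same assumptions as the earlier tokenizer example (placeholder `universe.bed` path); note that the iteration cursor (`curr`) lives on the set itself, so a set is exhausted after one full pass.

```python
from genimtools.tokenizers import Region, TreeTokenizer

tokenizer = TreeTokenizer("universe.bed")  # placeholder universe path
token_set = tokenizer.tokenize([Region("chr1", 100, 200)])

print(len(token_set))    # __len__

first = token_set[0]     # __getitem__
last = token_set[-1]     # negative indices count from the end
print(first.id, last.id)

for token in token_set:  # __iter__ / __next__
    print(token.chr, token.start, token.end, token.id)

try:
    token_set[len(token_set)]
except IndexError:
    print("out-of-range indices raise IndexError")
```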
