
Commit dab4beb

Merge pull request #22 from databio/performance
Fix python bindings tokenization performance issues
2 parents 539e35f + 6cead90 commit dab4beb

8 files changed, +102 -52 lines changed

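The heart of the change: instead of every `PyTokenizedRegion` and `PyTokenizedRegionSet` owning its own clone of the universe, they now hold a `Py<PyUniverse>` handle that lives on the Python heap and is shared via reference counting. A minimal sketch of that pyo3 pattern (the `Universe`/`Token` types below are simplified stand-ins for illustration, not the actual genimtools structs):

```rust
use pyo3::prelude::*;

// Stand-in for PyUniverse: a large lookup table that is expensive to clone.
#[pyclass]
pub struct Universe {
    pub names: Vec<String>,
}

// Stand-in for PyTokenizedRegion: instead of owning a copy of the universe,
// it holds a GIL-managed, reference-counted handle to the shared object.
#[pyclass]
pub struct Token {
    pub id: usize,
    pub universe: Py<Universe>,
}

#[pymethods]
impl Token {
    #[getter]
    fn name(&self) -> PyResult<String> {
        // Borrow the shared universe under the GIL; no data is copied.
        Python::with_gil(|py| Ok(self.universe.borrow(py).names[self.id].clone()))
    }
}
```

Duplicating the handle (e.g. with `clone_ref`) only bumps the interpreter's reference count, so handing the same universe to every token the tokenizer produces stays cheap.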

bindings/Cargo.toml (+1, -1)

```diff
@@ -1,6 +1,6 @@
 [package]
 name = "genimtools-py"
-version = "0.0.11"
+version = "0.0.12"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
```

bindings/src/models/region.rs (+39, -6)

```diff
@@ -79,34 +79,67 @@ impl PyRegion {
 #[derive(Clone, Debug)]
 pub struct PyTokenizedRegion {
     pub id: u32,
-    pub universe: PyUniverse,
+    pub universe: Py<PyUniverse>,
 }
 
 impl From<PyTokenizedRegion> for PyRegion {
     fn from(value: PyTokenizedRegion) -> Self {
-        value.universe.convert_id_to_region(value.id).unwrap()
+        Python::with_gil(|py| {
+            value
+                .universe
+                .borrow(py)
+                .convert_id_to_region(value.id)
+                .unwrap()
+        })
     }
 }
 
 #[pymethods]
 impl PyTokenizedRegion {
     #[getter]
     pub fn chr(&self) -> Result<String> {
-        Ok(self.universe.convert_id_to_region(self.id).unwrap().chr)
+        Python::with_gil(|py| {
+            Ok(self
+                .universe
+                .borrow(py)
+                .convert_id_to_region(self.id)
+                .unwrap()
+                .chr)
+        })
     }
 
     #[getter]
     pub fn start(&self) -> Result<u32> {
-        Ok(self.universe.convert_id_to_region(self.id).unwrap().start)
+        Python::with_gil(|py| {
+            Ok(self
+                .universe
+                .borrow(py)
+                .convert_id_to_region(self.id)
+                .unwrap()
+                .start)
+        })
     }
 
     #[getter]
     pub fn end(&self) -> Result<u32> {
-        Ok(self.universe.convert_id_to_region(self.id).unwrap().end)
+        Python::with_gil(|py| {
+            Ok(self
+                .universe
+                .borrow(py)
+                .convert_id_to_region(self.id)
+                .unwrap()
+                .end)
+        })
     }
 
     pub fn to_region(&self) -> Result<PyRegion> {
-        Ok(self.universe.convert_id_to_region(self.id).unwrap())
+        Python::with_gil(|py| {
+            Ok(self
+                .universe
+                .borrow(py)
+                .convert_id_to_region(self.id)
+                .unwrap())
+        })
     }
     #[getter]
     pub fn id(&self) -> Result<u32> {
```

bindings/src/models/region_set.rs (+30, -34)

```diff
@@ -6,7 +6,7 @@ use numpy::ndarray::Array;
 use numpy::{IntoPyArray, PyArray1};
 
 use anyhow::Result;
-use genimtools::common::{models::TokenizedRegionSet, utils::extract_regions_from_bed_file};
+use genimtools::common::utils::extract_regions_from_bed_file;
 
 use crate::models::{PyRegion, PyTokenizedRegion, PyUniverse};
 
@@ -89,18 +89,8 @@ impl PyRegionSet {
 #[derive(Clone, Debug)]
 pub struct PyTokenizedRegionSet {
     pub ids: Vec<u32>,
-    pub universe: PyUniverse,
-    curr: usize,
-}
-
-impl From<TokenizedRegionSet<'_>> for PyTokenizedRegionSet {
-    fn from(value: TokenizedRegionSet) -> Self {
-        PyTokenizedRegionSet {
-            ids: value.ids,
-            universe: value.universe.to_owned().into(),
-            curr: 0,
-        }
-    }
+    pub universe: Py<PyUniverse>,
+    pub curr: usize,
 }
 
 #[pymethods]
@@ -117,21 +107,25 @@ impl PyTokenizedRegionSet {
     }
 
     pub fn to_bit_vector(&self) -> Result<Vec<u8>> {
-        let mut bit_vector = vec![0; self.universe.id_to_region.len()];
+        Python::with_gil(|py| {
+            let mut bit_vector = vec![0; self.universe.borrow(py).id_to_region.len()];
 
-        for id in &self.ids {
-            bit_vector[*id as usize] = 1;
-        }
+            for id in &self.ids {
+                bit_vector[*id as usize] = 1;
+            }
 
-        Ok(bit_vector)
+            Ok(bit_vector)
+        })
     }
 
     pub fn to_regions(&self) -> Result<Vec<PyRegion>> {
-        Ok(self
-            .ids
-            .iter()
-            .map(|id| self.universe.id_to_region[&id].clone())
-            .collect())
+        Python::with_gil(|py| {
+            Ok(self
+                .ids
+                .iter()
+                .map(|id| self.universe.borrow(py).id_to_region[&id].clone())
+                .collect())
+        })
     }
 
     pub fn to_ids(&self) -> Result<Vec<u32>> {
@@ -164,17 +158,19 @@ impl PyTokenizedRegionSet {
     }
 
     pub fn __next__(&mut self) -> Option<PyTokenizedRegion> {
-        if self.curr < self.ids.len() {
-            let id = self.ids[self.curr];
-            self.curr += 1;
-
-            Some(PyTokenizedRegion {
-                universe: self.universe.to_owned(),
-                id,
-            })
-        } else {
-            None
-        }
+        Python::with_gil(|py| {
+            if self.curr < self.ids.len() {
+                let id = self.ids[self.curr];
+                self.curr += 1;
+
+                Some(PyTokenizedRegion {
+                    universe: self.universe.clone_ref(py),
+                    id,
+                })
+            } else {
+                None
+            }
+        })
     }
 
     pub fn __getitem__(&self, indx: isize) -> Result<PyTokenizedRegion> {
```

bindings/src/models/universe.rs (+6, -5)

```diff
@@ -17,17 +17,18 @@ pub struct PyUniverse {
 
 impl From<Universe> for PyUniverse {
     fn from(value: Universe) -> Self {
-        let regions = value.regions.iter().map(|r| r.to_owned().into()).collect();
+        let regions = value.regions.into_iter().map(|r| r.into()).collect();
 
         let region_to_id = value
             .region_to_id
-            .iter()
-            .map(|(k, v)| (k.to_owned().into(), *v))
+            .into_iter()
+            .map(|(k, v)| (k.into(), v))
             .collect();
+
         let id_to_region = value
             .id_to_region
-            .iter()
-            .map(|(k, v)| (*k, v.to_owned().into()))
+            .into_iter()
+            .map(|(k, v)| (k, v.into()))
             .collect();
 
         PyUniverse {
```

bindings/src/tokenizers/tree_tokenizer.rs (+21, -4)

```diff
@@ -15,16 +15,24 @@ use crate::utils::extract_regions_from_py_any;
 #[pyclass(name = "TreeTokenizer")]
 pub struct PyTreeTokenizer {
     pub tokenizer: TreeTokenizer,
+    pub universe: Py<PyUniverse>, // this is a Py-wrapped version self.tokenizer.universe for performance reasons
 }
 
 #[pymethods]
 impl PyTreeTokenizer {
     #[new]
     pub fn new(path: String) -> Result<Self> {
-        let path = Path::new(&path);
-        let tokenizer = TreeTokenizer::try_from(path)?;
+        Python::with_gil(|py| {
+            let path = Path::new(&path);
+            let tokenizer = TreeTokenizer::try_from(path)?;
+            let py_universe: PyUniverse = tokenizer.universe.to_owned().into();
+            let py_universe_bound = Py::new(py, py_universe)?;
 
-        Ok(PyTreeTokenizer { tokenizer })
+            Ok(PyTreeTokenizer {
+                tokenizer,
+                universe: py_universe_bound,
+            })
+        })
     }
 
     #[getter]
@@ -138,7 +146,16 @@ impl PyTreeTokenizer {
         // tokenize the RegionSet
        let tokenized = self.tokenizer.tokenize_region_set(&rs);
 
-        Ok(tokenized.into())
+        Python::with_gil(|py| {
+            let py_tokenized_region_set = PyTokenizedRegionSet {
+                ids: tokenized.ids,
+                curr: 0,
+                universe: self.universe.clone_ref(py),
+            };
+
+            Ok(py_tokenized_region_set)
+        })
+
     }
 
     // encode returns a list of ids
```

genimtools/Cargo.toml (+1, -1)

```diff
@@ -1,6 +1,6 @@
 [package]
 name = "genimtools"
-version = "0.0.11"
+version = "0.0.12"
 edition = "2021"
 description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package."
 license = "MIT"
```

genimtools/docs/changelog.md (+3, -0)

```diff
@@ -4,6 +4,9 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.0.12]
+- optimize creation of `PyRegionSet` to reduce expensive cloning of `Universe` structs.
+
 ## [0.0.11]
 - redesigned API for the tokenizers to better emulate the huggingface tokenizers API.
 - implemented new traits for tokenizers to allow for more flexibility when creating new tokenizers.
```
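The changelog entry above names the cost being avoided: deep-copying the universe for every tokenized result. A rough before/after sketch of that trade-off, assuming a simplified stand-in `Universe` type (not the real struct):

```rust
use pyo3::prelude::*;

// Hypothetical stand-in for the shared universe data.
#[pyclass]
pub struct Universe {
    pub regions: Vec<(String, u32, u32)>,
}

// Old pattern: each consumer receives a deep copy (cost grows with universe size).
fn share_by_clone(universe: &Universe) -> Universe {
    Universe {
        regions: universe.regions.clone(),
    }
}

// New pattern: consumers share one Python-owned object; clone_ref is O(1) and
// only increments the interpreter's reference count.
fn share_by_ref(universe: &Py<Universe>) -> Py<Universe> {
    Python::with_gil(|py| universe.clone_ref(py))
}
```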

genimtools/src/common/models/universe.rs (+1, -1)

```diff
@@ -82,4 +82,4 @@ impl TryFrom<&Path> for Universe {
             id_to_region,
         })
     }
-}
+}
```
