Skip to content

Commit c5b524b

Browse files
committed
gtok file format
1 parent a5047e0 commit c5b524b

File tree

8 files changed

+45
-2
lines changed

8 files changed

+45
-2
lines changed

bindings/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "genimtools-py"
3-
version = "0.0.8"
3+
version = "0.0.9"
44
edition = "2021"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

bindings/genimtools/utils/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .genimtools.utils import *
+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
from typing import List


def write_tokens_to_gtok(filename: str, tokens: List[int]) -> None:
    """
    Write a list of tokens to a gtok file.

    :param filename: path of the ``.gtok`` file to create.
    :param tokens: integer token ids to serialize.
    """


def read_tokens_from_gtok(filename: str) -> List[int]:
    """
    Read a list of tokens from a gtok file.

    :param filename: path of the ``.gtok`` file to read.
    :return: the integer token ids stored in the file.
    """

bindings/src/lib.rs

+5
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod consts;
66
mod models;
77
mod tokenizers;
88
mod vocab;
9+
mod utils;
910

1011
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
1112

@@ -14,9 +15,12 @@ fn genimtools(py: Python, m: &PyModule) -> PyResult<()> {
1415
let vocab_module = pyo3::wrap_pymodule!(vocab::vocab);
1516
let tokenize_module = pyo3::wrap_pymodule!(tokenizers::tokenizers);
1617
let ailist_module = pyo3::wrap_pymodule!(ailist::ailist);
18+
let utils_module = pyo3::wrap_pymodule!(utils::utils);
19+
1720
m.add_wrapped(vocab_module)?;
1821
m.add_wrapped(tokenize_module)?;
1922
m.add_wrapped(ailist_module)?;
23+
m.add_wrapped(utils_module)?;
2024

2125
let sys = PyModule::import(py, "sys")?;
2226
let sys_modules: &PyDict = sys.getattr("modules")?.downcast()?;
@@ -25,6 +29,7 @@ fn genimtools(py: Python, m: &PyModule) -> PyResult<()> {
2529
sys_modules.set_item("genimtools.vocab", m.getattr("vocab")?)?;
2630
sys_modules.set_item("genimtools.tokenizers", m.getattr("tokenizers")?)?;
2731
sys_modules.set_item("genimtools.ailist", m.getattr("ailist")?)?;
32+
sys_modules.set_item("genimtools.utils", m.getattr("utils")?)?;
2833

2934
// add constants
3035
m.add("PAD_CHR", consts::PAD_CHR)?;

bindings/src/models/region_set.rs

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use genimtools::common::consts::{PAD_CHR, PAD_END, PAD_START};
55

66
use crate::models::{PyRegion, PyTokenizedRegion};
77

8+
89
#[pyclass(name = "TokenizedRegionSet")]
910
#[derive(Clone, Debug)]
1011
pub struct PyTokenizedRegionSet {

bindings/src/utils/mod.rs

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
use pyo3::prelude::*;
2+
3+
4+
#[pyfunction]
5+
pub fn write_tokens_to_gtok(filename: &str, tokens: Vec<u32>) -> PyResult<()> {
6+
genimtools::io::write_tokens_to_gtok(filename, &tokens)?;
7+
Ok(())
8+
}
9+
10+
#[pyfunction]
11+
pub fn read_tokens_from_gtok(filename: &str) -> PyResult<Vec<u32>> {
12+
let tokens = genimtools::io::read_tokens_from_gtok(filename)?;
13+
Ok(tokens)
14+
}
15+
16+
#[pymodule]
17+
pub fn utils(_py: Python, m: &PyModule) -> PyResult<()> {
18+
m.add_wrapped(wrap_pyfunction!(write_tokens_to_gtok))?;
19+
m.add_wrapped(wrap_pyfunction!(read_tokens_from_gtok))?;
20+
Ok(())
21+
}

genimtools/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "genimtools"
3-
version = "0.0.8"
3+
version = "0.0.9"
44
edition = "2021"
55
description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package."
66
license = "MIT"

genimtools/docs/changelog.md

+4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7+
## [0.0.9]
8+
- start working on the concept of a `.gtok` file-format to store tokenized regions
9+
  - added basic readers and writers for this format
10+
711
## [0.0.8]
812
- add a new `ids_as_strs` getter to the `TokenizedRegionSet` struct so that we can get the ids as strings quickly, this is meant mostly for interface with geniml.
913

0 commit comments

Comments
 (0)