Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add control flow for pre-sorted bed files #69

Closed
donaldcampbelljr opened this issue Jan 21, 2025 · 3 comments
Closed

Add control flow for pre-sorted bed files #69

donaldcampbelljr opened this issue Jan 21, 2025 · 3 comments
Labels

Comments

@donaldcampbelljr
Copy link
Member

No description provided.

@donaldcampbelljr
Copy link
Member Author

donaldcampbelljr commented Jan 21, 2025

From most recent uniwig commits, it appears that, even if the flag sorted is set to true, we sort starts and ends as the program reads the bed file into memory:

        std::vector<chromosome> chromosomes;
        chromosomes = read_bed_vec(bedPath);
std::vector<chromosome> read_bed_vec(const char *bedPath)
{
    //vector of vector of regions to store regions in one vector per chromosome
    //std::cout << "\nInput file: " << bedPath << "\n";
    std::cout << "\nReading chromosomes" << std::endl;
    gzFile fp;
    kstream_t *ks;
    kstring_t str = {0, 0, 0};
    // int32_t k = 0;
    fp = bedPath && strcmp(bedPath, "-") ? gzopen(bedPath, "r") : gzdopen(0, "r");
    if (fp == 0)
    {
        fprintf(stderr, "ERROR: failed to open the input file\n");
        exit(1);
    }
    ks = ks_init(fp);
    chromosome chr;
    char chrom[100] = "";
    std::vector<chromosome> chromosomes;
    while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0)
    {
        char *ctg, *rest;
        int32_t st, en;
        ctg = parse_bed(str.s, &st, &en, &rest);
        //std:: cout << "\n" << ctg << "\t" << st << "\t" << en;
        if (strcmp(chrom, "") == 0)
        {
            strcpy(chrom, ctg);
            chr.chrom = std::string(chrom);
            chr.starts.push_back(st);
            chr.ends.push_back(en);
            continue;
        }
        if (ctg)
        {
            if(strcmp(chrom, ctg) != 0)
            {
                // std::cout << "\nI'm here, chrom = " << chrom << ", ctg = " << ctg <<  ", compare result: "<< strcmp(chrom, ctg) <<"\n";
                kx::radix_sort(chr.starts.begin(),chr.starts.end());
                kx::radix_sort(chr.ends.begin(),chr.ends.end());
                chromosomes.push_back(chr);

                strcpy(chrom, ctg);
                chr.chrom = std::string(chrom);
                std::vector<int> start;
                std::vector<int> end;
                chr.ends = end;
                chr.starts = start;
            }
            chr.starts.push_back(st);
            chr.ends.push_back(en);
        }
    }

    // sort the starts and ends respectively
    kx::radix_sort(chr.starts.begin(),chr.starts.end());
    kx::radix_sort(chr.ends.begin(),chr.ends.end());
    chromosomes.push_back(chr);

    std::cout << "Reading finished\n" << std::endl;
    free(str.s);
    ks_destroy(ks);
    gzclose(fp);
    return chromosomes;
}

If sorted is false, the program creates a mapping:

        std::map<std::string, chromosome> chromosomes;
        chromosomes = read_bed_map(bedPath);

where read_bed_map:

std::map<std::string, chromosome> read_bed_map(const char *bedPath)
{
    //vector of vector of regions to store regions in one vector per chromosome
    std::cout << "\nReading chromosomes" << std::endl;
    //std::cout << "\nInput file: " << bedPath << "\n";
    gzFile fp;
    kstream_t *ks;
    kstring_t str = {0, 0, 0};
    fp = bedPath && strcmp(bedPath, "-") ? gzopen(bedPath, "r") : gzdopen(0, "r");
    if (fp == 0)
    {
        fprintf(stderr, "ERROR: failed to open the input file\n");
        exit(1);
    }
    ks = ks_init(fp);
    // chromosome chr;
    char chrom[100] = "";
    std::map<std::string, chromosome> chromosomes;
    // std::vector<chromosome> chromosomes;

    while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0)
    {
        char *ctg, *rest;
        int32_t st, en;
        ctg = parse_bed(str.s, &st, &en, &rest);

        // std:: cout << "\n" << ctg << "\t" << st << "\t" << en;

        if (ctg) {
            if (strcmp(chrom, ctg) != 0) {
                strcpy(chrom, ctg);
                if (chromosomes.find(chrom) == chromosomes.end()) {
                    chromosome chr;
                    chr.chrom = std::string(chrom);
                    // fprintf(stderr, "Creating a new chromosome: %s\n", chr.chrom.c_str());
                    chromosomes.insert(std::pair<std::string, chromosome>(chrom, chr));
                }
            }
            chromosomes[chrom].starts.push_back(st);
            chromosomes[chrom].ends.push_back(en);
        }
    }

    // sort the starts and ends respectively
    for (std::map<std::string, chromosome>::iterator it=chromosomes.begin(); it!=chromosomes.end(); it++) {
        kx::radix_sort(it->second.starts.begin(),it->second.starts.end());
        kx::radix_sort(it->second.ends.begin(),it->second.ends.end());
    }


    std::cout << "Reading finished\n" << std::endl;
    free(str.s);
    ks_destroy(ks);
    gzclose(fp);
    return chromosomes;
}

This function also sorts the starts and ends in memory.

TODO:

donaldcampbelljr added a commit that referenced this issue Jan 21, 2025
@donaldcampbelljr
Copy link
Member Author

This may be superseded by #72

@donaldcampbelljr
Copy link
Member Author

Closing for now as building with release flag helped immensely.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

1 participant