diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json index 2fcb78ff..ee5ee41d 100644 --- a/dev/.documenter-siteinfo.json +++ b/dev/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.11.1","generation_timestamp":"2024-11-12T02:33:43","documenter_version":"1.7.0"}} \ No newline at end of file +{"documenter":{"julia_version":"1.11.1","generation_timestamp":"2024-11-13T02:35:20","documenter_version":"1.8.0"}} \ No newline at end of file diff --git a/dev/assets/documenter.js b/dev/assets/documenter.js index 82252a11..7d68cd80 100644 --- a/dev/assets/documenter.js +++ b/dev/assets/documenter.js @@ -612,176 +612,194 @@ function worker_function(documenterSearchIndex, documenterBaseURL, filters) { }; } -// `worker = Threads.@spawn worker_function(documenterSearchIndex)`, but in JavaScript! -const filters = [ - ...new Set(documenterSearchIndex["docs"].map((x) => x.category)), -]; -const worker_str = - "(" + - worker_function.toString() + - ")(" + - JSON.stringify(documenterSearchIndex["docs"]) + - "," + - JSON.stringify(documenterBaseURL) + - "," + - JSON.stringify(filters) + - ")"; -const worker_blob = new Blob([worker_str], { type: "text/javascript" }); -const worker = new Worker(URL.createObjectURL(worker_blob)); - /////// SEARCH MAIN /////// -// Whether the worker is currently handling a search. This is a boolean -// as the worker only ever handles 1 or 0 searches at a time. -var worker_is_running = false; - -// The last search text that was sent to the worker. This is used to determine -// if the worker should be launched again when it reports back results. -var last_search_text = ""; - -// The results of the last search. This, in combination with the state of the filters -// in the DOM, is used compute the results to display on calls to update_search. -var unfiltered_results = []; - -// Which filter is currently selected -var selected_filter = ""; - -$(document).on("input", ".documenter-search-input", function (event) { - if (!worker_is_running) { - launch_search(); - } -}); - -function launch_search() { - worker_is_running = true; - last_search_text = $(".documenter-search-input").val(); - worker.postMessage(last_search_text); -} - -worker.onmessage = function (e) { - if (last_search_text !== $(".documenter-search-input").val()) { - launch_search(); - } else { - worker_is_running = false; - } - - unfiltered_results = e.data; - update_search(); -}; +function runSearchMainCode() { + // `worker = Threads.@spawn worker_function(documenterSearchIndex)`, but in JavaScript! + const filters = [ + ...new Set(documenterSearchIndex["docs"].map((x) => x.category)), + ]; + const worker_str = + "(" + + worker_function.toString() + + ")(" + + JSON.stringify(documenterSearchIndex["docs"]) + + "," + + JSON.stringify(documenterBaseURL) + + "," + + JSON.stringify(filters) + + ")"; + const worker_blob = new Blob([worker_str], { type: "text/javascript" }); + const worker = new Worker(URL.createObjectURL(worker_blob)); + + // Whether the worker is currently handling a search. This is a boolean + // as the worker only ever handles 1 or 0 searches at a time. + var worker_is_running = false; + + // The last search text that was sent to the worker. This is used to determine + // if the worker should be launched again when it reports back results. + var last_search_text = ""; + + // The results of the last search. This, in combination with the state of the filters + // in the DOM, is used compute the results to display on calls to update_search. + var unfiltered_results = []; + + // Which filter is currently selected + var selected_filter = ""; + + $(document).on("input", ".documenter-search-input", function (event) { + if (!worker_is_running) { + launch_search(); + } + }); -$(document).on("click", ".search-filter", function () { - if ($(this).hasClass("search-filter-selected")) { - selected_filter = ""; - } else { - selected_filter = $(this).text().toLowerCase(); + function launch_search() { + worker_is_running = true; + last_search_text = $(".documenter-search-input").val(); + worker.postMessage(last_search_text); } - // This updates search results and toggles classes for UI: - update_search(); -}); + worker.onmessage = function (e) { + if (last_search_text !== $(".documenter-search-input").val()) { + launch_search(); + } else { + worker_is_running = false; + } -/** - * Make/Update the search component - */ -function update_search() { - let querystring = $(".documenter-search-input").val(); + unfiltered_results = e.data; + update_search(); + }; - if (querystring.trim()) { - if (selected_filter == "") { - results = unfiltered_results; + $(document).on("click", ".search-filter", function () { + if ($(this).hasClass("search-filter-selected")) { + selected_filter = ""; } else { - results = unfiltered_results.filter((result) => { - return selected_filter == result.category.toLowerCase(); - }); + selected_filter = $(this).text().toLowerCase(); } - let search_result_container = ``; - let modal_filters = make_modal_body_filters(); - let search_divider = `
`; + // This updates search results and toggles classes for UI: + update_search(); + }); - if (results.length) { - let links = []; - let count = 0; - let search_results = ""; - - for (var i = 0, n = results.length; i < n && count < 200; ++i) { - let result = results[i]; - if (result.location && !links.includes(result.location)) { - search_results += result.div; - count++; - links.push(result.location); - } - } + /** + * Make/Update the search component + */ + function update_search() { + let querystring = $(".documenter-search-input").val(); - if (count == 1) { - count_str = "1 result"; - } else if (count == 200) { - count_str = "200+ results"; + if (querystring.trim()) { + if (selected_filter == "") { + results = unfiltered_results; } else { - count_str = count + " results"; + results = unfiltered_results.filter((result) => { + return selected_filter == result.category.toLowerCase(); + }); } - let result_count = `
${count_str}
`; - search_result_container = ` + let search_result_container = ``; + let modal_filters = make_modal_body_filters(); + let search_divider = `
`; + + if (results.length) { + let links = []; + let count = 0; + let search_results = ""; + + for (var i = 0, n = results.length; i < n && count < 200; ++i) { + let result = results[i]; + if (result.location && !links.includes(result.location)) { + search_results += result.div; + count++; + links.push(result.location); + } + } + + if (count == 1) { + count_str = "1 result"; + } else if (count == 200) { + count_str = "200+ results"; + } else { + count_str = count + " results"; + } + let result_count = `
${count_str}
`; + + search_result_container = ` +
+ ${modal_filters} + ${search_divider} + ${result_count} +
+ ${search_results} +
+
+ `; + } else { + search_result_container = `
${modal_filters} ${search_divider} - ${result_count} -
- ${search_results} -
-
+
0 result(s)
+ +
No result found!
`; - } else { - search_result_container = ` -
- ${modal_filters} - ${search_divider} -
0 result(s)
-
-
No result found!
- `; - } + } - if ($(".search-modal-card-body").hasClass("is-justify-content-center")) { - $(".search-modal-card-body").removeClass("is-justify-content-center"); - } + if ($(".search-modal-card-body").hasClass("is-justify-content-center")) { + $(".search-modal-card-body").removeClass("is-justify-content-center"); + } - $(".search-modal-card-body").html(search_result_container); - } else { - if (!$(".search-modal-card-body").hasClass("is-justify-content-center")) { - $(".search-modal-card-body").addClass("is-justify-content-center"); + $(".search-modal-card-body").html(search_result_container); + } else { + if (!$(".search-modal-card-body").hasClass("is-justify-content-center")) { + $(".search-modal-card-body").addClass("is-justify-content-center"); + } + + $(".search-modal-card-body").html(` +
Type something to get started!
+ `); } + } - $(".search-modal-card-body").html(` -
Type something to get started!
- `); + /** + * Make the modal filter html + * + * @returns string + */ + function make_modal_body_filters() { + let str = filters + .map((val) => { + if (selected_filter == val.toLowerCase()) { + return `${val}`; + } else { + return `${val}`; + } + }) + .join(""); + + return ` +
+ Filters: + ${str} +
`; } } -/** - * Make the modal filter html - * - * @returns string - */ -function make_modal_body_filters() { - let str = filters - .map((val) => { - if (selected_filter == val.toLowerCase()) { - return `${val}`; - } else { - return `${val}`; - } - }) - .join(""); - - return ` -
- Filters: - ${str} -
`; +function waitUntilSearchIndexAvailable() { + // It is possible that the documenter.js script runs before the page + // has finished loading and documenterSearchIndex gets defined. + // So we need to wait until the search index actually loads before setting + // up all the search-related stuff. + if (typeof documenterSearchIndex !== "undefined") { + runSearchMainCode(); + } else { + console.warn("Search Index not available, waiting"); + setTimeout(waitUntilSearchIndexAvailable, 1000); + } } +// The actual entry point to the search code +waitUntilSearchIndexAvailable(); + }) //////////////////////////////////////////////////////////////////////////////// require(['jquery'], function($) { diff --git a/dev/index.html b/dev/index.html index 0b1ce1a7..fc14de7c 100644 --- a/dev/index.html +++ b/dev/index.html @@ -3,4 +3,4 @@ pkg"add https://github.com/OpenMendel/SnpArrays.jl" pkg"add https://github.com/OpenMendel/VCFTools.jl" pkg"add https://github.com/OpenMendel/BGEN.jl" -pkg"add https://github.com/OpenMendel/MendelImpute.jl"

This package supports Julia v1.6+.

Manual Outline

+pkg"add https://github.com/OpenMendel/MendelImpute.jl"

This package supports Julia v1.6+.

Manual Outline

diff --git a/dev/man/Phasing_and_Imputation/index.html b/dev/man/Phasing_and_Imputation/index.html index 8ee49c2d..0ebf78e2 100644 --- a/dev/man/Phasing_and_Imputation/index.html +++ b/dev/man/Phasing_and_Imputation/index.html @@ -145,4 +145,4 @@ # visualize error distribution histogram(quality[:error], label=:none, xlabel="per-sample quality score", - ylabel="Number of samples")

svg

Conclusion: Most samples are well imputed (e.g. score close to 1), but some samples are indeed poorly imputed. From the histogram, we can safely filtered out samples with score $< 0.999$ as that would remove poorly imputed individuals without reducing sample size too much.

+ ylabel="Number of samples")

svg

Conclusion: Most samples are well imputed (e.g. score close to 1), but some samples are indeed poorly imputed. From the histogram, we can safely filtered out samples with score $< 0.999$ as that would remove poorly imputed individuals without reducing sample size too much.

diff --git a/dev/man/api/index.html b/dev/man/api/index.html index 99f7dcf4..4a17dd48 100644 --- a/dev/man/api/index.html +++ b/dev/man/api/index.html @@ -6,4 +6,4 @@ [d::Int], [minwidth::Int], [overlap::Float64])

Cuts a haplotype matrix reffile into windows of variable width so that each window has less than d unique haplotypes. Saves result to outfile as a compressed binary format. All SNPs in tgtfile must be present in reffile. All genotypes in reffile must be phased and non-missing, and all genotype positions must be unique.

Inputs

Optional Inputs

Why is tgtfile required?

The unique haplotypes in each window is computed on the typed SNPs only. A genotype matrix tgtfile is used to identify the typed SNPs. In the future, hopefully we can pre-compute compressed haplotype panels for all genotyping platforms and provide them as downloadable files. But currently, users must run this function by themselves.

source
MendelImpute.convert_compressedFunction
convert_compressed(t<:Real, phaseinfo::String, reffile::String)

Converts phaseinfo into a phased genotype matrix of type t using the full reference haplotype panel H

Inputs

  • t: Type of matrix. If bool, genotypes are converted to a BitMatrix
  • phaseinfo: Vector of HaplotypeMosaicPairs stored in .jlso format
  • reffile: The complete (uncompressed) haplotype reference file

Output

  • X1: allele 1 of the phased genotype. Each column is a sample. X = X1 + X2.
  • X2: allele 2 of the phased genotype. Each column is a sample. X = X1 + X2.
  • phase: the original data structure after phasing and imputation.
  • sampleID: The ID's of each imputed person.
  • H: the complete reference haplotype panel. Columns of H are haplotypes.
source
convert_compressed(t<:Real, phaseinfo::Vector{HaplotypeMosaicPair}, H::AbstractMatrix)

Columns of H are haplotypes.

source
MendelImpute.admixture_globalFunction
admixture_global(tgtfile::String, reffile::String, 
     refID_to_population::Dict{String, String}, populations::Vector{String})

Computes global ancestry estimates for each sample in tgtfile using a labeled reference panel reffile.

Inputs

  • tgtfile: VCF or PLINK files. VCF files should end in .vcf or .vcf.gz. PLINK files should exclude .bim/.bed/.fam trailings but the trio must all be present in the same directory.
  • reffile: Reference haplotype file ending in .jlso (compressed binary files). See compress_haplotypes.
  • refID_to_population: A dictionary mapping each sample IDs in the haplotype reference panel to their population origin. For examples, see output of thousand_genome_population_to_superpopulation and thousand_genome_samples_to_super_population
  • populations: A vector of String containing unique populations present in values(refID_to_population).

Optional Inputs

  • Q_outfile: Output file name for the estimated Q matrix. Default Q_outfile="mendelimpute.ancestry.Q".
  • imputed_outfile: Output file name for the imputed genotypes ending in .jlso. Default impute_outfile = "mendelimpute.ancestry.Q.jlso"

Output

  • Q: A DataFrame containing estimated ancestry fractions. Each row is a sample. Matrix will be saved in mendelimpute.ancestry.Q
source
MendelImpute.admixture_localFunction
admixture_local(tgtfile::String, reffile::String, 
     refID_to_population::Dict{String, String}, populations::Vector{String},
-    population_colors::Vector{RGB{FixedPointNumbers.N0f8}})

Computes global ancestry estimates for each sample in tgtfile using a labeled reference panel reffile.

Inputs

  • tgtfile: VCF or PLINK files. VCF files should end in .vcf or .vcf.gz. PLINK files should exclude .bim/.bed/.fam trailings but the trio must all be present in the same directory.
  • reffile: Reference haplotype file ending in .jlso (compressed binary files). See compress_haplotypes.
  • refID_to_population: A dictionary mapping each sample IDs in the haplotype reference panel to their population origin. For examples, see output of thousand_genome_population_to_superpopulation and thousand_genome_samples_to_super_population
  • population: A list String containing unique populations present in values(refID_to_population).
  • population_colors: A vector of colors for each population. typeof(population_colors} should be Vector{RGB{FixedPointNumbers.N0f8}}

Output

  • Q: Matrix containing estimated ancestry fractions. Each row is a haplotype. Sample 1's haplotypes are in rows 1 and 2, sample 2's are in rows 3, 4...etc.
  • pop_colors: Matrix with sample dimension of Q storing colors.
source
MendelImpute.thousand_genome_samples_to_populationFunction
thousand_genome_samples_to_population()

Creates a dictionaries mapping sample IDs of 1000 genome project to 26 population codes.

Population code and super population codes are described here: https://www.internationalgenome.org/category/population/

source
MendelImpute.thousand_genome_samples_to_super_populationFunction
thousand_genome_samples_to_population()

Creates a dictionaries mapping sample IDs of 1000 genome project to 5 super population codes.

Population code and super population codes are described here: https://www.internationalgenome.org/category/population/

source
MendelImpute.thousand_genome_population_to_superpopulationFunction
thousand_genome_population_to_superpopulation()

Creates a dictionary mapping population codes of 1000 genome project to their super-population codes.

Population code and super population codes are described here: https://www.internationalgenome.org/category/population/

source
+ population_colors::Vector{RGB{FixedPointNumbers.N0f8}})

Computes global ancestry estimates for each sample in tgtfile using a labeled reference panel reffile.

Inputs

Output

source
MendelImpute.thousand_genome_samples_to_populationFunction
thousand_genome_samples_to_population()

Creates a dictionaries mapping sample IDs of 1000 genome project to 26 population codes.

Population code and super population codes are described here: https://www.internationalgenome.org/category/population/

source
MendelImpute.thousand_genome_samples_to_super_populationFunction
thousand_genome_samples_to_population()

Creates a dictionaries mapping sample IDs of 1000 genome project to 5 super population codes.

Population code and super population codes are described here: https://www.internationalgenome.org/category/population/

source
MendelImpute.thousand_genome_population_to_superpopulationFunction
thousand_genome_population_to_superpopulation()

Creates a dictionary mapping population codes of 1000 genome project to their super-population codes.

Population code and super population codes are described here: https://www.internationalgenome.org/category/population/

source
diff --git a/dev/man/painting/index.html b/dev/man/painting/index.html index eecc93a1..5c9927aa 100644 --- a/dev/man/painting/index.html +++ b/dev/man/painting/index.html @@ -158,4 +158,4 @@ inset = (1, bbox(-0.05, -0.1, 0.05, 1.1, :bottom, :right)), subplot = 2) # save figure -# savefig(local_plt, "local_admixture.png")

svg

Conclusion:

For more details, please refer to our paper, or file an issue on GitHub.

+# savefig(local_plt, "local_admixture.png")

svg

Conclusion:

For more details, please refer to our paper, or file an issue on GitHub.

diff --git a/dev/man/performance/index.html b/dev/man/performance/index.html index 3d761ad4..002f6eaa 100644 --- a/dev/man/performance/index.html +++ b/dev/man/performance/index.html @@ -1,2 +1,2 @@ -Performance Gotchas · MendelImpute

Performance gotchas

First time performance

In a fresh Julia session, the first time any function gets called will take a long time because the code has to be compiled on the spot. For instance, compare

@time using MendelImpute
  6.958706 seconds (16.72 M allocations: 1.148 GiB, 5.00% gc time)
@time using MendelImpute
  0.022658 seconds (32.81 k allocations: 1.886 MiB, 99.49% compilation time)

The first call was 350 times slower than the second time! Fortunately, for large problems, compilation time becomes negligible.

Run MendelImpute in parallel

If Julia is started with multiple threads (e.g. julia --threads 4), MendelImpute.jl will automatically run your code in parallel.

  1. How to start Julia with multiple threads.
  2. Execute Threads.nthreads() within Julia to check if multiple thread is enabled
  3. Set the number of BLAS threads to be 1 by using LinearAlgebra; BLAS.set_num_threads(1). This avoids oversubscription.
Note

We recommend number of threads equal to the number of physical CPU cores on your machine. Number of Julia threads should never exceed number of physical CPU cores!! Hyperthreading is valuable for I/O operations (in our experience), but not for linear algebra routines used throughout MendelImpute.

Compressing haplotype panels is slow

Currently it is recommended to build a new compressed reference haplotype panel for every new set of typed SNPs (although this is not strictly required). The compression routine is slow because reading raw VCF files is slow. Thus, it is highly advised that one try to use the same set of typed SNPs as much as possible.

We are actively developing a new set of functions in SnpArrays.jl to alleviate this problem. Since SnpArrays use memory mapping, read times can be improved dramatically.

max_d too high (or too low)

When you compress the haplotype panels into a .jlso format, you specified max_d which is the maximum number of unique haplotypes per window. We generally recommend using max_d = 1000, BUT 1000 may be too small if you use a reference panel larger than HRC. In that case, you can try larger max_d, which will improve error rate.

Symptoms for max_d too high:

Computing optimal haplotypes is too slow. In particular, the timing for haplopair search is too high.

Symptoms for max_d too low:

Too few typed SNPs per window indicates max_d is set too low. You can calculate the number of typed SNPs per window by dividing the total number of SNPs in the target file by the total windows (a number that will be output after every run). Ideally you want an average of 400 typed SNPs per window, but something as low as 50 still works. Something like 10~20 is too low.

I really want to use a high max_d

A high max_d generally improve error, so it is understandable you want to do so. If a high max_d value runs too slow, try setting stepwise = 100 and max_haplotypes to a number that is close to 1000. This avoids searching the global minimizer of the least-squares problem for windows that have more than max_haplotypes number of unique haplotypes. Setting thinning_factor instead of stepwise have a similar effect. Details for these 2 heuristic searches are explained in the appendix of our paper.

Do you have enough memory (RAM)?

While MendelImpute uses the least RAM compared to competing softwares (as of 2020), it is still possible for large imputation problems to consume all available RAM. If this happens, Julia will first try to use swap before crashing (until all of swap is consumed). Monitor your RAM usage constantly to make sure this doesn't happen. On Mac/Linux machines, the top or htop command will monitor this information. Alternatively, the /usr/bin/time command will automatically records max RAM usage for job and whether any swap had been performed.

Rough estimate for amount of RAM needed

There are 4 things that require lots of memory:

  • The target genotype matrix $\mathbf{X}_{n \times p}$ requires $n \times p \times 8$ bits. If $\mathbf{X}$ is dosage data, then you need instead $n \times p \times 32$ bits
  • The matrix $\mathbf{M}_{d \times d}$ requires $c \times d \times d \times 32$ bits, where $c$ is the number of parallel threads used and $d$ is the number specified in the compress_haplotypes function.
  • The matrix $\mathbf{N}_{n \times d}$ requires $c \times n \times d \times 32$ bits, where $c$ is the number of parallel threads used and $d$ is the number specified in the compress_haplotypes function.
  • The compressed reference haplotype panel produced by the compress_haplotypes function. This typically requires about $3r$ gigabytes of RAM where $r$ is your panel's size in .vcf.gz.

If you do not have the above issues and your code is still running slow, file an issue on GitHub and we will take a look at it ASAP.

+Performance Gotchas · MendelImpute

Performance gotchas

First time performance

In a fresh Julia session, the first time any function gets called will take a long time because the code has to be compiled on the spot. For instance, compare

@time using MendelImpute
  6.958706 seconds (16.72 M allocations: 1.148 GiB, 5.00% gc time)
@time using MendelImpute
  0.022658 seconds (32.81 k allocations: 1.886 MiB, 99.49% compilation time)

The first call was 350 times slower than the second time! Fortunately, for large problems, compilation time becomes negligible.

Run MendelImpute in parallel

If Julia is started with multiple threads (e.g. julia --threads 4), MendelImpute.jl will automatically run your code in parallel.

  1. How to start Julia with multiple threads.
  2. Execute Threads.nthreads() within Julia to check if multiple thread is enabled
  3. Set the number of BLAS threads to be 1 by using LinearAlgebra; BLAS.set_num_threads(1). This avoids oversubscription.
Note

We recommend number of threads equal to the number of physical CPU cores on your machine. Number of Julia threads should never exceed number of physical CPU cores!! Hyperthreading is valuable for I/O operations (in our experience), but not for linear algebra routines used throughout MendelImpute.

Compressing haplotype panels is slow

Currently it is recommended to build a new compressed reference haplotype panel for every new set of typed SNPs (although this is not strictly required). The compression routine is slow because reading raw VCF files is slow. Thus, it is highly advised that one try to use the same set of typed SNPs as much as possible.

We are actively developing a new set of functions in SnpArrays.jl to alleviate this problem. Since SnpArrays use memory mapping, read times can be improved dramatically.

max_d too high (or too low)

When you compress the haplotype panels into a .jlso format, you specified max_d which is the maximum number of unique haplotypes per window. We generally recommend using max_d = 1000, BUT 1000 may be too small if you use a reference panel larger than HRC. In that case, you can try larger max_d, which will improve error rate.

Symptoms for max_d too high:

Computing optimal haplotypes is too slow. In particular, the timing for haplopair search is too high.

Symptoms for max_d too low:

Too few typed SNPs per window indicates max_d is set too low. You can calculate the number of typed SNPs per window by dividing the total number of SNPs in the target file by the total windows (a number that will be output after every run). Ideally you want an average of 400 typed SNPs per window, but something as low as 50 still works. Something like 10~20 is too low.

I really want to use a high max_d

A high max_d generally improve error, so it is understandable you want to do so. If a high max_d value runs too slow, try setting stepwise = 100 and max_haplotypes to a number that is close to 1000. This avoids searching the global minimizer of the least-squares problem for windows that have more than max_haplotypes number of unique haplotypes. Setting thinning_factor instead of stepwise have a similar effect. Details for these 2 heuristic searches are explained in the appendix of our paper.

Do you have enough memory (RAM)?

While MendelImpute uses the least RAM compared to competing softwares (as of 2020), it is still possible for large imputation problems to consume all available RAM. If this happens, Julia will first try to use swap before crashing (until all of swap is consumed). Monitor your RAM usage constantly to make sure this doesn't happen. On Mac/Linux machines, the top or htop command will monitor this information. Alternatively, the /usr/bin/time command will automatically records max RAM usage for job and whether any swap had been performed.

Rough estimate for amount of RAM needed

There are 4 things that require lots of memory:

  • The target genotype matrix $\mathbf{X}_{n \times p}$ requires $n \times p \times 8$ bits. If $\mathbf{X}$ is dosage data, then you need instead $n \times p \times 32$ bits
  • The matrix $\mathbf{M}_{d \times d}$ requires $c \times d \times d \times 32$ bits, where $c$ is the number of parallel threads used and $d$ is the number specified in the compress_haplotypes function.
  • The matrix $\mathbf{N}_{n \times d}$ requires $c \times n \times d \times 32$ bits, where $c$ is the number of parallel threads used and $d$ is the number specified in the compress_haplotypes function.
  • The compressed reference haplotype panel produced by the compress_haplotypes function. This typically requires about $3r$ gigabytes of RAM where $r$ is your panel's size in .vcf.gz.

If you do not have the above issues and your code is still running slow, file an issue on GitHub and we will take a look at it ASAP.

diff --git a/dev/man/script/index.html b/dev/man/script/index.html index a95ce36a..e35ea399 100644 --- a/dev/man/script/index.html +++ b/dev/man/script/index.html @@ -9,4 +9,4 @@ BLAS.set_num_threads(1) # set BLAS threads to 1 (see performance gotchas) # run MendelImpute with default options -phase(tgtfile, reffile, outfile)

Then in the terminal/command-prompt, you can do

julia --threads 8 impute.jl ref.jlso target.vcf.gz output.vcf.gz
+phase(tgtfile, reffile, outfile)

Then in the terminal/command-prompt, you can do

julia --threads 8 impute.jl ref.jlso target.vcf.gz output.vcf.gz
diff --git a/dev/man/ultra+compress/index.html b/dev/man/ultra+compress/index.html index c918a063..53610df2 100644 --- a/dev/man/ultra+compress/index.html +++ b/dev/man/ultra+compress/index.html @@ -71,4 +71,4 @@ X1, X2, phaseinfo, sampleID, H = convert_compressed(Float64, tgtfile, reffile);
importing reference data...100%|████████████████████████| Time: 0:01:51

Check this compression protocol exhibit same error rate with standard VCF compression. Note that X1, X2, and H are transposed.

X_truth  = convert_gt(Float64, "target.chr22.full.vcf.gz") # import true genotypes
 X_mendel = (X1 + X2)' # transpose X1 and X2
 n, p = size(X_mendel)
-println("error overall = $(sum(X_mendel .!= X_truth) / n / p)")
error overall = 0.00527504782243333
+println("error overall = $(sum(X_mendel .!= X_truth) / n / p)")
error overall = 0.00527504782243333