From e19b169d918eebb14f37f299704277934d092d28 Mon Sep 17 00:00:00 2001
From: Andrey Prjibelski
Date: Wed, 8 May 2024 23:26:11 +0300
Subject: [PATCH] cosmetic changes

---
 README.md                 | 4 ++++
 src/dataset_processor.py  | 8 ++++----
 src/file_utils.py         | 1 +
 src/gene_info.py          | 2 +-
 src/input_data_storage.py | 1 -
 5 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 0cb8d9e7..01d2a109 100644
--- a/README.md
+++ b/README.md
@@ -106,6 +106,8 @@ Reads must be provided in FASTQ or FASTA format (can be gzipped). If you have al
 
 IsoQuant expect reads to contain polyA tails. For more reliable transcript model construction do not trim polyA tails.
 
+IsoQuant can also take aligned Illumina reads to correct long-read spliced alignments. However, short reads are _not_
+used to discover transcript models or compute abundances.
 
 ## Supported reference data
 
@@ -116,6 +118,8 @@ Reference genome is mandatory even when BAM files are provided.
 
 Reference gene annotation is not mandatory, but is likely to increase precision and recall. It can be provided in GFF/GTF format (can be gzipped). In this case it will be converted to [gffutils](https://pythonhosted.org/gffutils/installation.html) database. Information on converted databases will be stored in your `~/.config/IsoQuant/db_config.json` to increase speed of future runs. You can also provide gffutils database manually. Make sure that chromosome/scaffold names are identical in FASTA file and gene annotation.
 
+Note that gffutils databases may not work correctly on NFS shares. It is possible to set a designated folder for
+the database with `--genedb_output` (different from the output directory).
 
 Pre-constructed aligner index can also be provided to increase mapping time.
 
diff --git a/src/dataset_processor.py b/src/dataset_processor.py
index d069ac18..d68fee26 100644
--- a/src/dataset_processor.py
+++ b/src/dataset_processor.py
@@ -410,15 +410,15 @@ def __del__(self):
             os.remove(self.args.gunzipped_reference)
 
     def process_all_samples(self, input_data):
-        logger.info("Processing " + proper_plural_form("sample", len(input_data.samples)))
+        logger.info("Processing " + proper_plural_form("experiment", len(input_data.samples)))
         for sample in input_data.samples:
             self.process_sample(sample)
-        logger.info("Processed " + proper_plural_form("sample", len(input_data.samples)))
+        logger.info("Processed " + proper_plural_form("experiment", len(input_data.samples)))
 
     # Run through all genes in db and count stats according to alignments given in bamfile_name
     def process_sample(self, sample):
-        logger.info("Processing sample " + sample.prefix)
-        logger.info("Sample has " + proper_plural_form("BAM file", len(sample.file_list)) + ": " + ", ".join(
+        logger.info("Processing experiment " + sample.prefix)
+        logger.info("Experiment has " + proper_plural_form("BAM file", len(sample.file_list)) + ": " + ", ".join(
            map(lambda x: x[0], sample.file_list)))
 
         self.args.use_technical_replicas = self.args.read_group == "file_name" and len(sample.file_list) > 1
diff --git a/src/file_utils.py b/src/file_utils.py
index 00736a5b..c335483c 100644
--- a/src/file_utils.py
+++ b/src/file_utils.py
@@ -60,6 +60,7 @@ def merge_counts(counter, label, chr_ids, unaligned_reads=0):
         merged_file_handler.write("%s\t%d\n" % (v, stat_dict[v]))
     counter.reads_for_tpm = stat_dict[
         "__usable"]
+
 def normalize_path(config_path, file_path):
     if os.path.isabs(file_path):
         return os.path.normpath(file_path)
diff --git a/src/gene_info.py b/src/gene_info.py
index 9e74c652..a371a260 100644
--- a/src/gene_info.py
+++ b/src/gene_info.py
@@ -529,7 +529,7 @@ def set_introns_and_exons(self):
     def set_feature_properties(self, isoforms_to_feature_map, feature_profiles):
         similar_features = set()
         contained_features = set()
-        # FIXME: change to interval tree instead of brute force
+        # TODO: change to interval tree instead of brute force
         for f1 in feature_profiles.features:
             for f2 in feature_profiles.features:
                 if f1 == f2:
diff --git a/src/input_data_storage.py b/src/input_data_storage.py
index 9a7e7287..72f865af 100644
--- a/src/input_data_storage.py
+++ b/src/input_data_storage.py
@@ -196,7 +196,6 @@ def has_replicas(self):
         return any(len(sample.file_list) > 1 for sample in self.samples)
 
     def get_samples_from_yaml(self, yaml_file_path):
-        # TODO: allow relative paths, i.e. introduce "path fixer" for non-abosulte paths (relative to YAML file)
         sample_files = []
         experiment_names = []
         illumina_bam = []