diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..51227d7 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index cb5fc37..08a404d 100644 --- a/.gitignore +++ b/.gitignore @@ -160,4 +160,9 @@ cython_debug/ #.idea/ # IDE -.vscode/ \ No newline at end of file +.vscode/ +.devcontainer + +# testing files +sandbox/ +file_diffs/ \ No newline at end of file diff --git a/README.md b/README.md index 2a09ae8..30d5bc7 100644 --- a/README.md +++ b/README.md @@ -72,16 +72,15 @@ column2 SET column3 0.01 ``` -Currently implmented validation criteria include: +Currently implemented validation criteria include: | validation_criteria | explanation | | --- | --- | -| EXACT | the values in the two columns must be exactly the same; in this case `[foo,bar] != [bar,foo]` | -| SET | the values in the two columns must be the same set of values; in this case `[foo,bar] == [bar,foo]` | -| \ | the values in the two columns must be within `*100` of each other; e.g., 0.3 -> 30% difference allowed | -| IGNORE | the values in the two columns are assumed to match; in this case `foo == bar` | +| EXACT | The values in the two columns must be exactly the same; in this case `[foo,bar] != [bar,foo]`. When applied to columns referencing files, file contents will be compared to check if they are identical.| +| SET | The values in the two columns must be the same set of values; in this case `[foo,bar] == [bar,foo]`. When applied to columns referencing files, the lines within the files will be sorted alphabetically before comparing.| +| \ | The values in the two columns must be within `*100` of each other; e.g., 0.3 -> 30% difference allowed. | +| IGNORE | The values in the two columns are assumed to match; in this case `foo == bar`. | -Future comparisons to include `FILE-EXACT`, `FILE-SET`, `FILE-`. 
#### Optional: `column_translation` @@ -149,3 +148,6 @@ This file (available as an HTML and PDF) is a summary of the differences between - the number of samples failing the validation criteria If a `validation_criteria.tsv` file was provided, a definition of the (currently implemented) validation criteria are provided at the bottom of the table + +#### `__diff.txt` +Shows the differing lines within mismatching files for a given sample and column. Each pair of mismatching files generates a separate file. \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..c0986f5 --- /dev/null +++ b/__init__.py @@ -0,0 +1,2 @@ +__VERSION__ = "v0.0.1" +import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv b/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv new file mode 100644 index 0000000..1590aa0 --- /dev/null +++ b/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv @@ -0,0 +1,7 @@ +column criteria +assembly_length 0.01 +gambit_predicted_taxon EXACT +amrfinderplus_amr_core_genes SET +extra_column IGNORE +file_column EXACT +sort_file_column SET diff --git a/examples/file_comparison/file_comparison_column_translation.tsv b/examples/file_comparison/file_comparison_column_translation.tsv new file mode 100644 index 0000000..3cf7192 --- /dev/null +++ b/examples/file_comparison/file_comparison_column_translation.tsv @@ -0,0 +1,2 @@ +amrfinderplus_amr_genes amrfinderplus_amr_core_genes +extra_column2 extra_column \ No newline at end of file diff --git a/examples/file_comparison/file_comparison_columns_to_compare.txt b/examples/file_comparison/file_comparison_columns_to_compare.txt new file mode 100644 index 0000000..d67db40 --- /dev/null +++ b/examples/file_comparison/file_comparison_columns_to_compare.txt @@ -0,0 +1 @@ 
+"assembly_length,gambit_predicted_taxon,amrfinderplus_amr_core_genes,extra_column,file_column,sort_file_column" \ No newline at end of file diff --git a/examples/file_comparison/file_comparison_table1.tsv b/examples/file_comparison/file_comparison_table1.tsv new file mode 100644 index 0000000..1d42049 --- /dev/null +++ b/examples/file_comparison/file_comparison_table1.tsv @@ -0,0 +1,6 @@ +entity:table1_with_files_id amrfinderplus_amr_core_genes assembly_length extra_column file_column gambit_predicted_taxon sort_file_column +sample01 tet(A),aph(6)-Id,aph(3'')-Ib 4783605 extra_value gs://path/to/table1_files/match1-1.txt Salmonella enterica gs://path/to/table1_files/match1-1.txt +sample02 glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27 5226301 gs://path/to/table1_files/mismatch1-1.txt Shigella sonnei gs://path/to/table1_files/mismatch1-1.txt +sample03 4719410 extra_value gs://path/to/table1_files/mismatch2-1.txt Shigella gs://path/to/table1_files/sortmatch1-1.txt +sample04 sul1,aadA7,parC_S87L,gyrA_T83I 6674526 gs://path/to/table1_files/mismatch2-1.txt Pseudomonas aeruginosa gs://path/to/table1_files/mismatch1-1.txt +sample05 parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA 2773544 Staphylococcus aureus diff --git a/examples/file_comparison/file_comparison_table2.tsv b/examples/file_comparison/file_comparison_table2.tsv new file mode 100644 index 0000000..0e39e38 --- /dev/null +++ b/examples/file_comparison/file_comparison_table2.tsv @@ -0,0 +1,6 @@ +entity:table2_with_files_id amrfinderplus_amr_genes assembly_length extra_column2 file_column gambit_predicted_taxon sort_file_column +sample01 aph(3'')-Ib,aph(6)-Id,tet(A) 4783610 extra_value gs://path/to/table2_files/match1-1.txt Salmonella enterica gs://path/to/table2_files/match1-1.txt +sample02 glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1 5274928 gs://path/to/table2_files/mismatch1-1.txt Shigella sonnei gs://path/to/table2_files/mismatch1-1.txt +sample03 
glpT_E448K,gyrA_D87G,gyrA_S83L,sat2 5287603 gs://path/to/table2_files/mismatch2-1.txt Shigella sonnei gs://path/to/table2_files/sortmatch1-1.txt +sample04 parC_S87L,gyrA_T83I,sul1,aadA7 6674503 extra_value Pseudomonas aeruginosa +sample05 parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D 2771914 Staphylococcus aureus diff --git a/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt new file mode 100644 index 0000000..c6aa9ad --- /dev/null +++ b/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt @@ -0,0 +1,5 @@ +--- table1_files/path/to/table1_files/mismatch1-1.txt+++ table2_files/path/to/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo +-bar ++eggs ++spam + diff --git a/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt new file mode 100644 index 0000000..c6aa9ad --- /dev/null +++ b/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt @@ -0,0 +1,5 @@ +--- table1_files/path/to/table1_files/mismatch1-1.txt+++ table2_files/path/to/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo +-bar ++eggs ++spam + diff --git a/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt new file mode 100644 index 0000000..aebe16f --- /dev/null +++ b/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt @@ -0,0 +1,3 @@ +--- table1_files/path/to/table1_files/mismatch2-1.txt+++ table2_files/path/to/table2_files/mismatch2-1.txt@@ -1,2 +1 @@-1 2 3 +- ++1 2 diff --git a/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt new file mode 100644 index 0000000..fad4e1e --- /dev/null +++ b/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt @@ -0,0 +1,4 @@ 
+--- table1_files/path/to/table1_files/sortmatch1-1.txt+++ table2_files/path/to/table2_files/sortmatch1-1.txt@@ -1,3 +1,3 @@+baz + foo + bar +-baz diff --git a/examples/file_comparison/outputs/file_comparison_exact_differences.tsv b/examples/file_comparison/outputs/file_comparison_exact_differences.tsv new file mode 100644 index 0000000..9e07948 --- /dev/null +++ b/examples/file_comparison/outputs/file_comparison_exact_differences.tsv @@ -0,0 +1,8 @@ + amrfinderplus_amr_core_genes amrfinderplus_amr_core_genes assembly_length assembly_length extra_column extra_column gambit_predicted_taxon gambit_predicted_taxon sort_file_column sort_file_column file_column file_column + table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv +samples +sample01 tet(A),aph(6)-Id,aph(3'')-Ib aph(3'')-Ib,aph(6)-Id,tet(A) 4783605 4783610 +sample02 glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27 glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1 5226301 5274928 gs://path/to/table1_files/mismatch1-1.txt gs://path/to/table2_files/mismatch1-1.txt gs://path/to/table1_files/mismatch1-1.txt gs://path/to/table2_files/mismatch1-1.txt +sample03 glpT_E448K,gyrA_D87G,gyrA_S83L,sat2 4719410 5287603 extra_value Shigella Shigella sonnei gs://path/to/table1_files/sortmatch1-1.txt gs://path/to/table2_files/sortmatch1-1.txt gs://path/to/table1_files/mismatch2-1.txt gs://path/to/table2_files/mismatch2-1.txt +sample04 sul1,aadA7,parC_S87L,gyrA_T83I parC_S87L,gyrA_T83I,sul1,aadA7 6674526 6674503 extra_value gs://path/to/table1_files/mismatch1-1.txt gs://path/to/table1_files/mismatch2-1.txt +sample05 parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D 2773544 2771914 diff --git 
a/examples/file_comparison/outputs/file_comparison_summary.pdf b/examples/file_comparison/outputs/file_comparison_summary.pdf new file mode 100644 index 0000000..f36eb86 Binary files /dev/null and b/examples/file_comparison/outputs/file_comparison_summary.pdf differ diff --git a/examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv b/examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv new file mode 100644 index 0000000..8dd9d37 --- /dev/null +++ b/examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv @@ -0,0 +1,7 @@ +Column assembly_length assembly_length gambit_predicted_taxon gambit_predicted_taxon amrfinderplus_amr_core_genes amrfinderplus_amr_core_genes file_column file_column sort_file_column sort_file_column +Table table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv +sample01 +sample02 gs://path/to/table1_files/mismatch1-1.txt gs://path/to/table2_files/mismatch1-1.txt gs://path/to/table1_files/mismatch1-1.txt gs://path/to/table2_files/mismatch1-1.txt +sample03 4719410.0 5287603.0 Shigella Shigella sonnei glpT_E448K,gyrA_D87G,gyrA_S83L,sat2 gs://path/to/table1_files/mismatch2-1.txt gs://path/to/table2_files/mismatch2-1.txt +sample04 gs://path/to/table1_files/mismatch2-1.txt gs://path/to/table1_files/mismatch1-1.txt +sample05 diff --git a/examples/file_comparison/outputs/filtered_file_comparison_table1.tsv b/examples/file_comparison/outputs/filtered_file_comparison_table1.tsv new file mode 100644 index 0000000..ad16c0e --- /dev/null +++ b/examples/file_comparison/outputs/filtered_file_comparison_table1.tsv @@ -0,0 +1,6 @@ +samples amrfinderplus_amr_core_genes assembly_length extra_column file_column gambit_predicted_taxon sort_file_column +sample01 tet(A),aph(6)-Id,aph(3'')-Ib 4783605 
extra_value gs://path/to/table1_files/match1-1.txt Salmonella enterica gs://path/to/table1_files/match1-1.txt +sample02 glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27 5226301 gs://path/to/table1_files/mismatch1-1.txt Shigella sonnei gs://path/to/table1_files/mismatch1-1.txt +sample03 4719410 extra_value gs://path/to/table1_files/mismatch2-1.txt Shigella gs://path/to/table1_files/sortmatch1-1.txt +sample04 sul1,aadA7,parC_S87L,gyrA_T83I 6674526 gs://path/to/table1_files/mismatch2-1.txt Pseudomonas aeruginosa gs://path/to/table1_files/mismatch1-1.txt +sample05 parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA 2773544 Staphylococcus aureus diff --git a/examples/file_comparison/outputs/filtered_file_comparison_table2.tsv b/examples/file_comparison/outputs/filtered_file_comparison_table2.tsv new file mode 100644 index 0000000..dfc036f --- /dev/null +++ b/examples/file_comparison/outputs/filtered_file_comparison_table2.tsv @@ -0,0 +1,6 @@ +samples amrfinderplus_amr_core_genes assembly_length extra_column file_column gambit_predicted_taxon sort_file_column +sample01 aph(3'')-Ib,aph(6)-Id,tet(A) 4783610 extra_value gs://path/to/table2_files/match1-1.txt Salmonella enterica gs://path/to/table2_files/match1-1.txt +sample02 glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1 5274928 gs://path/to/table2_files/mismatch1-1.txt Shigella sonnei gs://path/to/table2_files/mismatch1-1.txt +sample03 glpT_E448K,gyrA_D87G,gyrA_S83L,sat2 5287603 gs://path/to/table2_files/mismatch2-1.txt Shigella sonnei gs://path/to/table2_files/sortmatch1-1.txt +sample04 parC_S87L,gyrA_T83I,sul1,aadA7 6674503 extra_value Pseudomonas aeruginosa +sample05 parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D 2771914 Staphylococcus aureus diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..c0986f5 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,2 @@ +__VERSION__ = "v0.0.1" +import os, sys; 
sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/tests/table1_files/match1-1.txt b/tests/table1_files/match1-1.txt new file mode 100644 index 0000000..7bd112e --- /dev/null +++ b/tests/table1_files/match1-1.txt @@ -0,0 +1,3 @@ +foo +bar + diff --git a/tests/table1_files/match1-2.txt b/tests/table1_files/match1-2.txt new file mode 100644 index 0000000..42f0295 --- /dev/null +++ b/tests/table1_files/match1-2.txt @@ -0,0 +1,3 @@ +baz +eggs + diff --git a/tests/table1_files/match1-3.txt b/tests/table1_files/match1-3.txt new file mode 100644 index 0000000..fe05684 --- /dev/null +++ b/tests/table1_files/match1-3.txt @@ -0,0 +1,3 @@ +spam +monty + diff --git a/tests/table1_files/match2-1.txt b/tests/table1_files/match2-1.txt new file mode 100644 index 0000000..2b70035 --- /dev/null +++ b/tests/table1_files/match2-1.txt @@ -0,0 +1,2 @@ +1 2 3 + diff --git a/tests/table1_files/match2-2.txt b/tests/table1_files/match2-2.txt new file mode 100644 index 0000000..8db5eef --- /dev/null +++ b/tests/table1_files/match2-2.txt @@ -0,0 +1,2 @@ +4 5 6 + diff --git a/tests/table1_files/match2-3.txt b/tests/table1_files/match2-3.txt new file mode 100644 index 0000000..ee64adb --- /dev/null +++ b/tests/table1_files/match2-3.txt @@ -0,0 +1,2 @@ +7 8 9 + diff --git a/tests/table1_files/mismatch1-1.txt b/tests/table1_files/mismatch1-1.txt new file mode 100644 index 0000000..7bd112e --- /dev/null +++ b/tests/table1_files/mismatch1-1.txt @@ -0,0 +1,3 @@ +foo +bar + diff --git a/tests/table1_files/mismatch1-2.txt b/tests/table1_files/mismatch1-2.txt new file mode 100644 index 0000000..75d7bfb --- /dev/null +++ b/tests/table1_files/mismatch1-2.txt @@ -0,0 +1,2 @@ +foo + diff --git a/tests/table1_files/mismatch1-3.txt b/tests/table1_files/mismatch1-3.txt new file mode 100644 index 0000000..d86174f --- /dev/null +++ b/tests/table1_files/mismatch1-3.txt @@ -0,0 +1,4 @@ + +spam +eggs + diff --git a/tests/table1_files/mismatch2-1.txt 
b/tests/table1_files/mismatch2-1.txt new file mode 100644 index 0000000..2b70035 --- /dev/null +++ b/tests/table1_files/mismatch2-1.txt @@ -0,0 +1,2 @@ +1 2 3 + diff --git a/tests/table1_files/mismatch2-2.txt b/tests/table1_files/mismatch2-2.txt new file mode 100644 index 0000000..a28f8ae --- /dev/null +++ b/tests/table1_files/mismatch2-2.txt @@ -0,0 +1,2 @@ +5 6 6 + diff --git a/tests/table1_files/mismatch2-3.txt b/tests/table1_files/mismatch2-3.txt new file mode 100644 index 0000000..ae0e511 --- /dev/null +++ b/tests/table1_files/mismatch2-3.txt @@ -0,0 +1,2 @@ +hello, world + diff --git a/tests/table1_files/sortmatch1-1.txt b/tests/table1_files/sortmatch1-1.txt new file mode 100644 index 0000000..86e041d --- /dev/null +++ b/tests/table1_files/sortmatch1-1.txt @@ -0,0 +1,3 @@ +foo +bar +baz diff --git a/tests/table2_files/match1-1.txt b/tests/table2_files/match1-1.txt new file mode 100644 index 0000000..7bd112e --- /dev/null +++ b/tests/table2_files/match1-1.txt @@ -0,0 +1,3 @@ +foo +bar + diff --git a/tests/table2_files/match1-2.txt b/tests/table2_files/match1-2.txt new file mode 100644 index 0000000..42f0295 --- /dev/null +++ b/tests/table2_files/match1-2.txt @@ -0,0 +1,3 @@ +baz +eggs + diff --git a/tests/table2_files/match1-3.txt b/tests/table2_files/match1-3.txt new file mode 100644 index 0000000..fe05684 --- /dev/null +++ b/tests/table2_files/match1-3.txt @@ -0,0 +1,3 @@ +spam +monty + diff --git a/tests/table2_files/match2-1.txt b/tests/table2_files/match2-1.txt new file mode 100644 index 0000000..2b70035 --- /dev/null +++ b/tests/table2_files/match2-1.txt @@ -0,0 +1,2 @@ +1 2 3 + diff --git a/tests/table2_files/match2-2.txt b/tests/table2_files/match2-2.txt new file mode 100644 index 0000000..8db5eef --- /dev/null +++ b/tests/table2_files/match2-2.txt @@ -0,0 +1,2 @@ +4 5 6 + diff --git a/tests/table2_files/match2-3.txt b/tests/table2_files/match2-3.txt new file mode 100644 index 0000000..ee64adb --- /dev/null +++ b/tests/table2_files/match2-3.txt @@ -0,0 
+1,2 @@ +7 8 9 + diff --git a/tests/table2_files/mismatch1-1.txt b/tests/table2_files/mismatch1-1.txt new file mode 100644 index 0000000..34ae2c6 --- /dev/null +++ b/tests/table2_files/mismatch1-1.txt @@ -0,0 +1,3 @@ +eggs +spam + diff --git a/tests/table2_files/mismatch1-2.txt b/tests/table2_files/mismatch1-2.txt new file mode 100644 index 0000000..7cd519a --- /dev/null +++ b/tests/table2_files/mismatch1-2.txt @@ -0,0 +1,3 @@ +foo +foo + diff --git a/tests/table2_files/mismatch1-3.txt b/tests/table2_files/mismatch1-3.txt new file mode 100644 index 0000000..fbabddf --- /dev/null +++ b/tests/table2_files/mismatch1-3.txt @@ -0,0 +1,3 @@ +spam + +eggs diff --git a/tests/table2_files/mismatch2-1.txt b/tests/table2_files/mismatch2-1.txt new file mode 100644 index 0000000..8d04f96 --- /dev/null +++ b/tests/table2_files/mismatch2-1.txt @@ -0,0 +1 @@ +1 2 diff --git a/tests/table2_files/mismatch2-2.txt b/tests/table2_files/mismatch2-2.txt new file mode 100644 index 0000000..336a0f9 --- /dev/null +++ b/tests/table2_files/mismatch2-2.txt @@ -0,0 +1,2 @@ +4 5 6 + diff --git a/tests/table2_files/mismatch2-3.txt b/tests/table2_files/mismatch2-3.txt new file mode 100644 index 0000000..270c611 --- /dev/null +++ b/tests/table2_files/mismatch2-3.txt @@ -0,0 +1 @@ +hello, world! diff --git a/tests/table2_files/sortmatch1-1.txt b/tests/table2_files/sortmatch1-1.txt new file mode 100644 index 0000000..4fc6926 --- /dev/null +++ b/tests/table2_files/sortmatch1-1.txt @@ -0,0 +1,3 @@ +baz +foo +bar diff --git a/tests/test_validator.py b/tests/test_validator.py new file mode 100644 index 0000000..453f79f --- /dev/null +++ b/tests/test_validator.py @@ -0,0 +1,437 @@ +# To run these unit tests, run "python3 -m unittest" from the root of the +# project directory. 
+ +from theiavalidate.Validator import Validator +from theiavalidate.theiavalidate import DEFAULT_NA_VALUES + +import numpy as np +import pandas as pd +import unittest + + +class MockOptions: + """ + Mock the "options" object that is created in theiavalidate.py. In + theiavalidate.py, this object is created from command-line arguments using + the argparse package, but here we will simulate this object with a + different class to more easily create Validator objects. + """ + def __init__(self): + self.table1 = None + self.table2 = None + self.version = None + self.columns_to_compare = [] + self.validation_criteria = None + self.column_translation = None + self.output_prefix = None + self.na_values = DEFAULT_NA_VALUES + self.verbose = False + self.debug = False + + +class TestDetermineFileColumns(unittest.TestCase): + """ + Test detecting which columns in the tables correspond to files. If there is at + least one URI and no other values except np.nan in both tables, we should + treat the column as a "file_column". 
+ """ + def setUp(self): + self.validator = Validator(MockOptions()) + + def run_determine_file_columns(self, data1, data2): + self.validator.table1 = pd.DataFrame(data1) + self.validator.table2 = pd.DataFrame(data2) + self.validator.determine_file_columns() + + def test_no_file_columns(self): + data = { + "col1": [1, 2, 3], + "col2": ["foo", "bar", "baz"] + } + self.run_determine_file_columns(data, data) + self.assertEqual(len(self.validator.file_columns), 0) + + def test_some_file_columns(self): + data1 = { + "col1": [1, 2, 3], + "col2": ["gs://foo", "gs://bar", "gs://baz"] + } + data2 = { + "col1": [1, 2, 3], + "col2": ["gs://eggs", "gs://spam", "gs://monty"] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col2"}) + + def test_missing_uri(self): + data1 = { + "col1": [1, 2, 3], + "col2": ["gs://foo", np.nan, "gs://baz"] + } + data2 = { + "col1": [1, 2, 3], + "col2": ["gs://eggs", "gs://spam", "gs://monty"] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col2"}) + + def test_both_columns_null(self): + data1 = { + "col1": ["gs://foo", "gs://bar", "gs://baz"], + "col2": [np.nan, np.nan, np.nan] + } + data2 = { + "col1": ["gs://eggs", "gs://spam", "gs://monty"], + "col2": [np.nan, np.nan, np.nan] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1"}) + + def test_one_column_null(self): + data1 = { + "col1": ["gs://foo", "gs://bar", "gs://baz"], + "col2": ["gs://x", "gs://y", "gs://z"] + } + data2 = { + "col1": ["gs://eggs", "gs://spam", "gs://monty"], + "col2": [np.nan, np.nan, np.nan] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1", "col2"}) + + def test_mixed_nulls(self): + data1 = { + "col1": ["gs://foo", "gs://foo", np.nan], + "col2": ["gs://x", "gs://y", np.nan] + } + data2 = { + "col1": ["gs://eggs", np.nan, np.nan], + "col2": [np.nan, 
"gs://b", np.nan] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1", "col2"}) + + def test_one_column_not_null(self): + data1 = { + "col1": ["gs://foo", "gs://bar", "gs://baz"], + "col2": ["gs://x", "gs://y", "gs://z"] + } + data2 = { + "col1": ["gs://eggs", "gs://spam", "gs://monty"], + "col2": [1, 2, 3] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1"}) + + +class TestCompareFiles(unittest.TestCase): + """ + Test comparing files (exact match). Identical files or two np.nans + should count as an exact match, anything else should count as a mismatch. + """ + SAMPLES_INDEX = ["sample1", "sample2", "sample3"] + COLUMNS_INDEX = ["col1", "col2"] + + def setUp(self): + self.validator = Validator(MockOptions()) + self.validator.table1_name = "table1" + self.validator.table2_name = "table2" + self.validator.table1_files_dir = "tests/table1_files" + self.validator.table2_files_dir = "tests/table2_files" + self.diff_dir = "/dev/null" # discard diff files + + def create_matching_files_tables(self): + df1 = pd.DataFrame({ + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"] + }) + df2 = pd.DataFrame({ + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"] + }) + for df in [df1, df2]: + df.index = self.SAMPLES_INDEX + return df1, df2 + + def create_mismatching_files_tables(self): + df1 = pd.DataFrame({ + "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], + "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"] + }) + df2 = pd.DataFrame({ + "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], + "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"] + }) + for df in 
[df1, df2]: + df.index = self.SAMPLES_INDEX + return df1, df2 + + def create_mix_matching_files_tables(self): + df1 = pd.DataFrame({ + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": ["gs://mismatch2-1.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"] + }) + df2 = pd.DataFrame({ + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": ["gs://mismatch2-1.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"] + }) + for df in [df1, df2]: + df.index = self.SAMPLES_INDEX + return df1, df2 + + def create_null_files_tables(self): + df1 = pd.DataFrame({ + "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"] + }) + df2 = pd.DataFrame({ + "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": ["gs://match2-1.txt", np.nan, np.nan] + }) + for df in [df1, df2]: + df.index = self.SAMPLES_INDEX + return df1, df2 + + def test_matching_files_exact_matches(self): + df1, df2 = self.create_matching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "col1": [True, True, True], + "col2": [True, True, True] + }) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected) + + def test_mismatching_files_exact_matches(self): + df1, df2 = self.create_mismatching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "col1": [False, False, False], + "col2": [False, False, False] + }) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected) + + def test_mix_matching_files_exact_matches(self): + df1, df2 = self.create_mix_matching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "col1": [True, True, True], + "col2": [False, True, False] + }) + expected.index = self.SAMPLES_INDEX + 
pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected) + + def test_null_files_exact_matches(self): + df1, df2 = self.create_null_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "col1": [True, True, True], + "col2": [False, False, False] + }) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected) + + def test_matching_files_number_of_differences(self): + df1, df2 = self.create_matching_files_tables() + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() + expected = pd.DataFrame({ + self.validator.NUM_DIFFERENCES_COL: [0, 0] # identical files in both tables: no differences expected + }) + expected.index = self.COLUMNS_INDEX + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) + + def test_mismatching_files_number_of_differences(self): + df1, df2 = self.create_mismatching_files_tables() + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() + expected = pd.DataFrame({ + self.validator.NUM_DIFFERENCES_COL: [3, 3] + }) + expected.index = self.COLUMNS_INDEX + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) + + def test_mix_matching_files_number_of_differences(self): + df1, df2 = self.create_mix_matching_files_tables() + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() + expected = pd.DataFrame({ + self.validator.NUM_DIFFERENCES_COL: [0, 2] + }) + expected.index = self.COLUMNS_INDEX + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) + + def test_null_files_number_of_differences(self): + df1, df2 = self.create_null_files_tables() + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() + expected = pd.DataFrame({ + self.validator.NUM_DIFFERENCES_COL: [0, 3] + }) + expected.index = self.COLUMNS_INDEX + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, 
expected) + + def test_matching_files_exact_differences(self): + df1, df2 = self.create_matching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + ("col1", "table1"): [np.nan, np.nan, np.nan], + ("col1", "table2"): [np.nan, np.nan, np.nan], + ("col2", "table1"): [np.nan, np.nan, np.nan], + ("col2", "table2"): [np.nan, np.nan, np.nan] + }).astype(object) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) + + def test_mismatching_files_exact_differences(self): + df1, df2 = self.create_mismatching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + ("col1", "table1"): ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], + ("col1", "table2"): ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], + ("col2", "table1"): ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"], + ("col2", "table2"): ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"] + }).astype(object) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) + + def test_mix_matching_files_exact_differences(self): + df1, df2 = self.create_mix_matching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + ("col1", "table1"): [np.nan, np.nan, np.nan], + ("col1", "table2"): [np.nan, np.nan, np.nan], + ("col2", "table1"): ["gs://mismatch2-1.txt", np.nan, "gs://mismatch2-3.txt"], + ("col2", "table2"): ["gs://mismatch2-1.txt", np.nan, "gs://mismatch2-3.txt"] + }).astype(object) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) + + def test_null_files_exact_differences(self): + df1, df2 = self.create_null_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + ("col1", "table1"): [np.nan, np.nan, np.nan], + 
("col1", "table2"): [np.nan, np.nan, np.nan], + ("col2", "table1"): [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"], + ("col2", "table2"): ["gs://match2-1.txt", np.nan, np.nan] + }).astype(object) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) + +class TestValidateFiles(unittest.TestCase): + """ + Test comparing files using the validation criteria. EXACT follows the same + logic as compare_files(), SET should treat files as matching if after + sorting they are identical, IGNORE should "skip" the files. Other criteria + should result in an Exception. + """ + SAMPLES_INDEX = ["sample1", "sample2", "sample3", "sample4", "sample5"] + COLUMNS_INDEX = ["exact_col", "set_col", "ignore_col", "float_col"] + TABLE1_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan] + TABLE2_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan] + EXACT_MATCHES_MASK = [True, False, False, False, True] + + def setUp(self): + self.validator = Validator(MockOptions()) + self.validator.validation_criteria = pd.DataFrame({ + "exact_col": "EXACT", + "set_col": "SET", + "ignore_col": "IGNORE", + "float_col": 0.1, + }, index=["column", "criteria"] + ) + + # This numeric conversion is done in the Validator init method + self.validator.validation_criteria = (self.validator.validation_criteria + .apply(pd.to_numeric, errors="ignore").convert_dtypes() + ) + + # assign the same URIs to each column, will test that the validation + # results vary depending on the validation criterion + self.validator.table1 = pd.DataFrame({ + "samples": self.SAMPLES_INDEX, + "exact_col": self.TABLE1_FILE_URIS, + "set_col": self.TABLE1_FILE_URIS, + "ignore_col": self.TABLE1_FILE_URIS, + "float_col": self.TABLE1_FILE_URIS # uh-oh + }) + + self.validator.table2 = pd.DataFrame({ + "samples": self.SAMPLES_INDEX, + "exact_col": self.TABLE2_FILE_URIS, + 
"set_col": self.TABLE2_FILE_URIS, + "ignore_col": self.TABLE2_FILE_URIS, + "float_col": self.TABLE2_FILE_URIS # uh-oh + }) + + # the exact matches will be identical regardless of validation criteria + self.validator.file_exact_matches = pd.DataFrame({ + "exact_col": self.EXACT_MATCHES_MASK, + "set_col": self.EXACT_MATCHES_MASK, + "ignore_col": self.EXACT_MATCHES_MASK, + "float_col": self.EXACT_MATCHES_MASK + }) + self.validator.file_exact_matches.index = self.SAMPLES_INDEX + + self.validator.file_number_of_differences = pd.DataFrame({ + self.validator.NUM_DIFFERENCES_COL: [3, 3, 3, 3] + }) + self.validator.file_number_of_differences.index = self.COLUMNS_INDEX + + self.validator.table1_name = "table1" + self.validator.table2_name = "table2" + self.validator.table1_files_dir = "tests/table1_files" + self.validator.table2_files_dir = "tests/table2_files" + + self.validator.validation_table = pd.DataFrame() + + def test_validate_exact(self): + column = self.validator.validation_criteria["exact_col"] + observed = self.validator.validate_files(column) + expected = ("EXACT", 3) + self.assertEqual(observed, expected) + + def test_validate_ignore(self): + column = self.validator.validation_criteria["ignore_col"] + observed = self.validator.validate_files(column) + expected = ("IGNORE", 0) + self.assertEqual(observed, expected) + + def test_validate_set(self): + column = self.validator.validation_criteria["set_col"] + observed = self.validator.validate_files(column) + expected = ("SET", 2) # sorted file should not count as different + self.assertEqual(observed, expected) + + def test_validate_float(self): + # have not implemented % difference for files + column = self.validator.validation_criteria["float_col"] + self.assertRaises(Exception, self.validator.validate_files, column) # pass callable and argument separately so assertRaises makes the call itself + + def test_validation_table(self): + for column in ["exact_col", "set_col", "ignore_col"]: + column = self.validator.validation_criteria[column] + self.validator.validate_files(column) + + # these 
steps are done in run_validation_checks + self.validator.validation_table.set_index(self.validator.table1["samples"], inplace=True) + self.validator.validation_table.rename_axis(None, axis="index", inplace=True) + self.validator.validation_table.columns = pd.MultiIndex.from_tuples( + self.validator.validation_table.columns, names=["Column", "Table"] + ) + + # exact_col should count sortmatch file as a mismatch, while set_col should + # count it as a match. + # no column should be generated for ignore_col. + expected = pd.DataFrame({ + ("exact_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan], + ("exact_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan], + ("set_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", np.nan, np.nan], + ("set_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, np.nan, np.nan], + }) + expected.set_index(self.validator.table1["samples"], inplace=True) + expected.rename_axis(None, axis="index", inplace=True) + expected.columns = pd.MultiIndex.from_tuples(expected.columns, names=["Column", "Table"]) + pd.testing.assert_frame_equal(self.validator.validation_table, expected) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index c0dda71..5c66144 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -1,16 +1,21 @@ from datetime import date from pretty_html_table import build_table + +import difflib +import filecmp import logging import numpy as np import os import pandas as pd import pdfkit as pdf +import subprocess import sys class Validator: """ This class runs the parsing module for theiavalidate """ + NUM_DIFFERENCES_COL = "Number of differences (exact match)" def __init__(self, options): logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr) self.logger = logging.getLogger(__name__) @@ -38,7 +43,18 @@ def __init__(self, options): self.validation_criteria = 
options.validation_criteria self.columns_to_compare = options.columns_to_compare self.columns_to_compare.append("samples") - + + self.file_columns = set() # columns that contain GCP URIs to files + self.table1_files_dir = "table1_files" + self.table2_files_dir = "table2_files" + self.diff_dir = "file_diffs" + + # DataFrames for holding file comparison results + self.file_exact_matches = None + self.file_exact_differences = None + self.file_number_of_differences = None + self.file_validations = None + self.output_prefix = options.output_prefix self.na_values = options.na_values @@ -134,35 +150,158 @@ def count_populated_cells(self): self.logger.debug("Creating the summary table with the number of populated cells") self.summary_output = pd.concat([table1_populated_rows, table2_populated_rows], join="outer", axis=1) + + def determine_file_columns(self): + """ + Determine the columns with GCP URIs so that they are excluded from regular + comparisons and instead file comparisons are performed. 
+ """ + for df in [self.table1, self.table2]: + # select columns with at least one GCP URI among nulls + file_columns = df.columns[(df.apply(lambda x: x.astype(str).str.startswith("gs://") + | x.isnull()).all()) + & (~df.isnull().all())] + + file_columns = file_columns.tolist() + self.file_columns.update(file_columns) + + # Ensure file_columns set only has GCP URIs and nulls + for df in [self.table1, self.table2]: + remove_columns = df.columns[~(df.apply(lambda x: x.astype(str).str.startswith('gs://') + | x.isnull()).all())] + + # Convert the Index object to a set + remove_columns = set(remove_columns.tolist()) + self.file_columns = self.file_columns - remove_columns + """ This function performs an exact match and creates and Excel file that contains the exact match differences """ def perform_exact_match(self): self.logger.debug("Performing an exact match and removing the sample name column") + + if self.file_columns: + # exclude file_columns for string comparison + table1 = self.table1.drop(list(self.file_columns), axis=1) + table2 = self.table2.drop(list(self.file_columns), axis=1) + + # handle file comparisons separately from strings + # TODO: set index to samples column in main table earlier? 
+ files_df1 = self.table1.set_index("samples") + files_df2 = self.table2.set_index("samples") + files_df1 = files_df1[list(self.file_columns)] + files_df2 = files_df2[list(self.file_columns)] + self.compare_files(files_df1, files_df2) + self.set_file_number_of_differences() + else: + table1 = self.table1 + table2 = self.table2 + # count the number of differences using exact string matches # temporarily make NaNs null since NaN != NaN for the pd.DataFrame.eq() function # also: remove the samplename row - number_of_differences = pd.DataFrame((~self.table1.fillna("NULL").astype(str).eq(self.table2.fillna("NULL").astype(str))).sum(), columns = ["Number of differences (exact match)"]) + number_of_differences = pd.DataFrame((~table1.fillna("NULL").astype(str).eq(table2.fillna("NULL").astype(str))).sum(), columns = [self.NUM_DIFFERENCES_COL]) + number_of_differences.drop("samples", axis=0, inplace=True) + # add the number of differences to the summary output table self.logger.debug("Adding the number of exact match differences to the summary table") self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1) - + + if self.file_number_of_differences is not None: + self.summary_output = self.summary_output.combine_first(self.file_number_of_differences) + self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int) + + # ensure number of differences column is the last column + self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output.pop(self.NUM_DIFFERENCES_COL) + # get a table of self-other differences # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame self.logger.debug("Creating a table of self-other differences") - exact_differences_table = self.table1.drop("samples", axis=1).compare(self.table2.drop("samples", axis=1), keep_shape=True).set_index(self.table1["samples"]) + exact_differences_table = 
table1.drop("samples", axis=1).compare(table2.drop("samples", axis=1), keep_shape=True).set_index(table1["samples"]) # rename the self and other with the table names self.logger.debug("Renaming the self and other to be the table names") exact_differences_table.rename(columns={"self": self.table1_name, "other": self.table2_name}, level=-1, inplace=True) + + # add file exact differences + exact_differences_table = pd.concat([exact_differences_table, self.file_exact_differences], axis=1) + # replace matching values (NAs) with blanks self.logger.debug("Replacing all NA values with blanks") exact_differences_table.replace(np.nan, "", inplace=True) + self.logger.debug("Writing the self-other differences table to a TSV file") exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True) + + def compare_files(self, file_df1, file_df2): + """ + Determine which pairs of files referenced in the DataFrames are identical + """ + self.file_exact_matches = pd.DataFrame(index=file_df1.index, + columns=file_df1.columns) + + # create similar table to one generated by df1.compare(df2) + # for adding to the exact differences TSV + self.file_exact_differences = pd.DataFrame( + index=file_df1.index, + columns=pd.MultiIndex.from_product([file_df1.columns, [self.table1_name, self.table2_name]]) + ) + + for col in file_df1.columns: + for row in file_df1.index: + uri1 = file_df1.loc[row, col] + uri2 = file_df2.loc[row, col] + if pd.isnull(uri1) and pd.isnull(uri2): + # count two nulls as matching + self.file_exact_matches.loc[row, col] = True + elif (not pd.isnull(uri1) and not pd.isnull(uri2)): + file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://")) + file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://")) + is_match = filecmp.cmp(file1, file2, shallow=False) + self.file_exact_matches.loc[row, col] = is_match + if is_match: + # don't add URIs to exact differences table if files match + 
self.file_exact_differences.loc[row, (col, self.table1_name)] = np.nan + self.file_exact_differences.loc[row, (col, self.table2_name)] = np.nan + continue + else: + output_filename = f"{row}_{col}_diff.txt" + output_path = os.path.join(self.diff_dir, output_filename) + self.create_diff(file1, file2, output_path) + else: + # count as not matching if pair is missing + self.file_exact_matches.loc[row, col] = False + + self.file_exact_differences.loc[row, (col, self.table1_name)] = uri1 + self.file_exact_differences.loc[row, (col, self.table2_name)] = uri2 + + self.file_exact_matches = self.file_exact_matches.astype(bool) + + def set_file_number_of_differences(self): + self.file_number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL]) + for col in self.file_exact_matches.columns: + count = self.file_exact_matches[col].dropna().ne(True).sum() + self.file_number_of_differences.loc[col] = count + + def create_diff(self, file1, file2, output_path): + # create unified diff + with open(file1, "r") as f1, open(file2, "r") as f2: + diff = difflib.unified_diff( + f1.readlines(), + f2.readlines(), + fromfile=file1, + tofile=file2, + lineterm='', + ) + diff = "".join(diff) + + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as out: + out.write(diff) + """ This function calculates the percent difference between two values """ @@ -180,14 +319,17 @@ def percent_difference(self, value1, value2): def validate(self, column): if column.name in self.table1.columns: # check the data type of the validation criteria; based on its type, we can assume the comparison to perform - if pd.api.types.is_string_dtype(column) == True: # if a string + if column.name in self.file_columns: + # handle file validation separately from strings, floats + validation_criterion, number_of_differences = self.validate_files(column) + return (validation_criterion, number_of_differences) + elif pd.api.types.is_string_dtype(column) == True: # if a string if 
column[0] == "EXACT": # count the number of exact match failures/differences self.logger.debug("Performing an exact match on column {} and counting the number of differences".format(column.name)) exact_matches = ~self.table1[column.name].fillna("NULL").eq(self.table2[column.name].fillna("NULL")) self.validation_table[(column.name, self.table1_name)] = self.table1[column.name].where(exact_matches) self.validation_table[(column.name, self.table2_name)] = self.table2[column.name].where(exact_matches) - number_of_differences = exact_matches.sum() return ("EXACT", number_of_differences) elif column[0] == "IGNORE": # do not check; there are no failures (0) @@ -223,21 +365,80 @@ def validate(self, column): else: self.logger.debug("Column {} was not found; indicating np.nan failures".format(column.name)) return ("COLUMN " + column.name + " NOT FOUND", np.nan) + + def validate_files(self, column): + """ + Perform validation of matching file contents based on which of EXACT, + IGNORE, or SET is assigned as the column's validation criterion. For SET, + sort lines in file before comparing. 
+ """ + validation_criterion = column.iloc[0] + if validation_criterion == "EXACT": + # we already know where the exact matches are from compare_files() + self.validation_table[(column.name, self.table1_name)] = (self.table1 + .set_index("samples")[column.name] + .where(~self.file_exact_matches[column.name]) + .reset_index()[column.name] + ) + self.validation_table[(column.name, self.table2_name)] = (self.table2 + .set_index("samples")[column.name] + .where(~self.file_exact_matches[column.name]) + .reset_index()[column.name] + ) + number_of_differences = self.file_number_of_differences.loc[column.name, self.NUM_DIFFERENCES_COL] + elif validation_criterion == "IGNORE": + number_of_differences = 0 + elif validation_criterion == "SET": + # for SET, sort lines in files then compare + concat_columns = pd.concat([self.table1[column.name], self.table2[column.name]], axis=1) + concat_columns = concat_columns.applymap( + lambda x: x.removeprefix("gs://") if pd.notnull(x) else x + ) + sorted_file_matches = concat_columns.apply(self.compare_sorted_files, axis=1) + self.validation_table[(column.name, self.table1_name)] = (self.table1[column.name] + .where(~sorted_file_matches) + ) + self.validation_table[(column.name, self.table2_name)] = (self.table2[column.name] + .where(~sorted_file_matches) + ) + number_of_differences = len(sorted_file_matches) - sorted_file_matches.sum() + else: + raise Exception("Only EXACT, IGNORE, and SET validation criteria implemented for file columns") + return (validation_criterion, number_of_differences) + + def compare_sorted_files(self, row): + """ + Compare two files sorted alphabetically by line for a pair of file URIs. 
+ """ + file1 = row.iloc[0] + file2 = row.iloc[1] + if pd.isnull(file1) and pd.isnull(file2): + # count two nulls as matching + return True + if pd.notnull(file1) and pd.notnull(file2): + file1 = os.path.join(self.table1_files_dir, file1) + file2 = os.path.join(self.table2_files_dir, file2) + with open(file1, "r") as f1, open(file2, "r") as f2: + lines1 = f1.readlines() + lines2 = f2.readlines() + lines1.sort() + lines2.sort() + return lines1 == lines2 + # count null + not-null as mismatching + return False """ This function creates, formats, and runs the validation criteria checks - """ + """ def run_validation_checks(self): self.validation_table = pd.DataFrame() self.logger.debug("Performing the validation checks") self.summary_output[["Validation Criteria", "Number of samples failing the validation criteria"]] = pd.DataFrame(self.validation_criteria.apply(lambda x: self.validate(x), result_type="expand")).transpose() - # format the validation criteria differences table self.logger.debug("Formatting the validation criteria differences table") self.validation_table.set_index(self.table1["samples"], inplace=True) self.validation_table.rename_axis(None, axis="index", inplace=True) - self.validation_table.transpose() self.validation_table.columns = pd.MultiIndex.from_tuples(self.validation_table.columns, names=["Column", "Table"]) @@ -317,6 +518,18 @@ def compare(self): self.logger.info("Counting how many cells have values") self.count_populated_cells() + + self.logger.info("Determining columns for file comparisons") + self.determine_file_columns() + + dir1 = f"{self.table1_files_dir}/" + dir2 = f"{self.table2_files_dir}/" + os.mkdir(dir1) + os.mkdir(dir2) + + self.logger.info("Localizing files to compare...") + self.table1[list(self.file_columns)].apply(localize_files, directory=dir1) + self.table2[list(self.file_columns)].apply(localize_files, directory=dir2) self.logger.info("Performing an exact string match") self.perform_exact_match() @@ -329,4 +542,17 @@ def 
compare(self): self.make_pdf_report() self.logger.info("Done!") - \ No newline at end of file + +def localize_files(row, directory): + """ + Download files to compare from GCP. + """ + for value in row: + if isinstance(value, str) and value.startswith("gs://"): + # it would be much faster to copy files all at once, but any files with + # the same name would be clobbered, so create local directories matching + # gsutil path and loop to copy + remote_path = os.path.dirname(value.removeprefix("gs://")) + destination_path = os.path.join(directory, remote_path) + os.makedirs(destination_path, exist_ok=True) + subprocess.run(["gsutil", "-m", "cp", value, destination_path]) diff --git a/theiavalidate/__init__.py b/theiavalidate/__init__.py index 9a65ac3..c0986f5 100644 --- a/theiavalidate/__init__.py +++ b/theiavalidate/__init__.py @@ -1 +1,2 @@ -__VERSION__ = "v0.0.1" \ No newline at end of file +__VERSION__ = "v0.0.1" +import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/theiavalidate/theiavalidate.py b/theiavalidate/theiavalidate.py index bbdc82d..c04ae3b 100644 --- a/theiavalidate/theiavalidate.py +++ b/theiavalidate/theiavalidate.py @@ -5,6 +5,11 @@ from __init__ import __VERSION__ from Validator import Validator +DEFAULT_NA_VALUES = [ + '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', + '', '#NA', 'NULL', 'null', 'NaN','-NaN', 'nan', '-nan', 'None' +] + def main(): parser = argparse.ArgumentParser( description = "This tool compares two tab-delimited files and outputs a report of the differences between the two files.", @@ -25,8 +30,8 @@ def main(): parser.add_argument("-o", "--output_prefix", help="the output file name prefix\ndo not include any spaces", default="theiavalidate", metavar="\b") parser.add_argument("-n", "--na_values", - help="the values that should be considered NA\ndefault values = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 
'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None']", - default= ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None'], metavar="\b", type=int) + help=f"the values that should be considered NA\ndefault values = {DEFAULT_NA_VALUES}", + default=DEFAULT_NA_VALUES, metavar="\b", type=int) parser.add_argument("--verbose", help="increase stdout verbosity", action="store_true", default=False) parser.add_argument("--debug",