diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..51227d7
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
index cb5fc37..08a404d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,4 +160,9 @@ cython_debug/
 #.idea/
 
 # IDE
-.vscode/
\ No newline at end of file
+.vscode/
+.devcontainer
+
+# testing files
+sandbox/
+file_diffs/
\ No newline at end of file
diff --git a/README.md b/README.md
index 2a09ae8..30d5bc7 100644
--- a/README.md
+++ b/README.md
@@ -72,16 +72,15 @@ column2         SET
 column3         0.01
 ```
 
-Currently implmented validation criteria include:
+Currently implemented validation criteria include:
 
 | validation_criteria | explanation |
 | --- | --- |
-| EXACT | the values in the two columns must be exactly the same; in this case `[foo,bar] != [bar,foo]` |
-| SET | the values in the two columns must be the same set of values; in this case `[foo,bar] == [bar,foo]` |
-| \<FLOAT\> | the values in the two columns must be within `<FLOAT>*100` of each other; e.g., 0.3 -> 30% difference allowed |
-| IGNORE | the values in the two columns are assumed to match; in this case `foo == bar` |
+| EXACT | The values in the two columns must be exactly the same; in this case `[foo,bar] != [bar,foo]`. When applied to columns referencing files, file contents will be compared to check if they are identical.|
+| SET | The values in the two columns must be the same set of values; in this case `[foo,bar] == [bar,foo]`. When applied to columns referencing files, the lines within the files will be sorted alphabetically before comparing.|
+| \<FLOAT\> | The values in the two columns must be within `<FLOAT>*100` of each other; e.g., 0.3 -> 30% difference allowed. |
+| IGNORE | The values in the two columns are assumed to match; in this case `foo == bar`. |
 
-Future comparisons to include `FILE-EXACT`, `FILE-SET`, `FILE-<FLOAT>`.
 
 #### Optional: `column_translation`
 
@@ -149,3 +148,6 @@ This file (available as an HTML and PDF) is a summary of the differences between
   - the number of samples failing the validation criteria
 
 If a `validation_criteria.tsv` file was provided, a definition of the (currently implemented) validation criteria are provided at the bottom of the table
+
+#### `<sample>_<column>_diff.txt`
+Shows the differing lines within mismatching files for a given sample and column. Each pair of mismatching files generates a separate file.
\ No newline at end of file
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..c0986f5
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,2 @@
+__VERSION__ = "v0.0.1"
+import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
\ No newline at end of file
diff --git a/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv b/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv
new file mode 100644
index 0000000..1590aa0
--- /dev/null
+++ b/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv
@@ -0,0 +1,7 @@
+column	criteria
+assembly_length	0.01
+gambit_predicted_taxon	EXACT
+amrfinderplus_amr_core_genes	SET
+extra_column	IGNORE
+file_column	EXACT
+sort_file_column	SET
diff --git a/examples/file_comparison/file_comparison_column_translation.tsv b/examples/file_comparison/file_comparison_column_translation.tsv
new file mode 100644
index 0000000..3cf7192
--- /dev/null
+++ b/examples/file_comparison/file_comparison_column_translation.tsv
@@ -0,0 +1,2 @@
+amrfinderplus_amr_genes	amrfinderplus_amr_core_genes
+extra_column2	extra_column
\ No newline at end of file
diff --git a/examples/file_comparison/file_comparison_columns_to_compare.txt b/examples/file_comparison/file_comparison_columns_to_compare.txt
new file mode 100644
index 0000000..d67db40
--- /dev/null
+++ b/examples/file_comparison/file_comparison_columns_to_compare.txt
@@ -0,0 +1 @@
+"assembly_length,gambit_predicted_taxon,amrfinderplus_amr_core_genes,extra_column,file_column,sort_file_column"
\ No newline at end of file
diff --git a/examples/file_comparison/file_comparison_table1.tsv b/examples/file_comparison/file_comparison_table1.tsv
new file mode 100644
index 0000000..1d42049
--- /dev/null
+++ b/examples/file_comparison/file_comparison_table1.tsv
@@ -0,0 +1,6 @@
+entity:table1_with_files_id	amrfinderplus_amr_core_genes	assembly_length	extra_column	file_column	gambit_predicted_taxon	sort_file_column
+sample01	tet(A),aph(6)-Id,aph(3'')-Ib	4783605	extra_value	gs://path/to/table1_files/match1-1.txt	Salmonella enterica	gs://path/to/table1_files/match1-1.txt
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27	5226301		gs://path/to/table1_files/mismatch1-1.txt	Shigella sonnei	gs://path/to/table1_files/mismatch1-1.txt
+sample03		4719410	extra_value	gs://path/to/table1_files/mismatch2-1.txt	Shigella	gs://path/to/table1_files/sortmatch1-1.txt
+sample04	sul1,aadA7,parC_S87L,gyrA_T83I	6674526		gs://path/to/table1_files/mismatch2-1.txt	Pseudomonas aeruginosa	gs://path/to/table1_files/mismatch1-1.txt
+sample05	parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA	2773544			Staphylococcus aureus	
diff --git a/examples/file_comparison/file_comparison_table2.tsv b/examples/file_comparison/file_comparison_table2.tsv
new file mode 100644
index 0000000..0e39e38
--- /dev/null
+++ b/examples/file_comparison/file_comparison_table2.tsv
@@ -0,0 +1,6 @@
+entity:table2_with_files_id	amrfinderplus_amr_genes	assembly_length	extra_column2	file_column	gambit_predicted_taxon	sort_file_column
+sample01	aph(3'')-Ib,aph(6)-Id,tet(A)	4783610	extra_value	gs://path/to/table2_files/match1-1.txt	Salmonella enterica	gs://path/to/table2_files/match1-1.txt
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1	5274928		gs://path/to/table2_files/mismatch1-1.txt	Shigella sonnei	gs://path/to/table2_files/mismatch1-1.txt
+sample03	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2	5287603		gs://path/to/table2_files/mismatch2-1.txt	Shigella sonnei	gs://path/to/table2_files/sortmatch1-1.txt
+sample04	parC_S87L,gyrA_T83I,sul1,aadA7	6674503	extra_value		Pseudomonas aeruginosa	
+sample05	parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D	2771914			Staphylococcus aureus	
diff --git a/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt
new file mode 100644
index 0000000..c6aa9ad
--- /dev/null
+++ b/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt
@@ -0,0 +1,5 @@
+--- table1_files/path/to/table1_files/mismatch1-1.txt+++ table2_files/path/to/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo
+-bar
++eggs
++spam
+ 
diff --git a/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt
new file mode 100644
index 0000000..c6aa9ad
--- /dev/null
+++ b/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt
@@ -0,0 +1,5 @@
+--- table1_files/path/to/table1_files/mismatch1-1.txt+++ table2_files/path/to/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo
+-bar
++eggs
++spam
+ 
diff --git a/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt
new file mode 100644
index 0000000..aebe16f
--- /dev/null
+++ b/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt
@@ -0,0 +1,3 @@
+--- table1_files/path/to/table1_files/mismatch2-1.txt+++ table2_files/path/to/table2_files/mismatch2-1.txt@@ -1,2 +1 @@-1 2 3
+-
++1 2
diff --git a/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt
new file mode 100644
index 0000000..fad4e1e
--- /dev/null
+++ b/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt
@@ -0,0 +1,4 @@
+--- table1_files/path/to/table1_files/sortmatch1-1.txt+++ table2_files/path/to/table2_files/sortmatch1-1.txt@@ -1,3 +1,3 @@+baz
+ foo
+ bar
+-baz
diff --git a/examples/file_comparison/outputs/file_comparison_exact_differences.tsv b/examples/file_comparison/outputs/file_comparison_exact_differences.tsv
new file mode 100644
index 0000000..9e07948
--- /dev/null
+++ b/examples/file_comparison/outputs/file_comparison_exact_differences.tsv
@@ -0,0 +1,8 @@
+	amrfinderplus_amr_core_genes	amrfinderplus_amr_core_genes	assembly_length	assembly_length	extra_column	extra_column	gambit_predicted_taxon	gambit_predicted_taxon	sort_file_column	sort_file_column	file_column	file_column
+	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv
+samples												
+sample01	tet(A),aph(6)-Id,aph(3'')-Ib	aph(3'')-Ib,aph(6)-Id,tet(A)	4783605	4783610								
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27	glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1	5226301	5274928					gs://path/to/table1_files/mismatch1-1.txt	gs://path/to/table2_files/mismatch1-1.txt	gs://path/to/table1_files/mismatch1-1.txt	gs://path/to/table2_files/mismatch1-1.txt
+sample03		glpT_E448K,gyrA_D87G,gyrA_S83L,sat2	4719410	5287603	extra_value		Shigella	Shigella sonnei	gs://path/to/table1_files/sortmatch1-1.txt	gs://path/to/table2_files/sortmatch1-1.txt	gs://path/to/table1_files/mismatch2-1.txt	gs://path/to/table2_files/mismatch2-1.txt
+sample04	sul1,aadA7,parC_S87L,gyrA_T83I	parC_S87L,gyrA_T83I,sul1,aadA7	6674526	6674503		extra_value			gs://path/to/table1_files/mismatch1-1.txt		gs://path/to/table1_files/mismatch2-1.txt	
+sample05	parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA	parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D	2773544	2771914								
diff --git a/examples/file_comparison/outputs/file_comparison_summary.pdf b/examples/file_comparison/outputs/file_comparison_summary.pdf
new file mode 100644
index 0000000..f36eb86
Binary files /dev/null and b/examples/file_comparison/outputs/file_comparison_summary.pdf differ
diff --git a/examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv b/examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv
new file mode 100644
index 0000000..8dd9d37
--- /dev/null
+++ b/examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv	
@@ -0,0 +1,7 @@
+Column	assembly_length	assembly_length	gambit_predicted_taxon	gambit_predicted_taxon	amrfinderplus_amr_core_genes	amrfinderplus_amr_core_genes	file_column	file_column	sort_file_column	sort_file_column
+Table	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv
+sample01										
+sample02							gs://path/to/table1_files/mismatch1-1.txt	gs://path/to/table2_files/mismatch1-1.txt	gs://path/to/table1_files/mismatch1-1.txt	gs://path/to/table2_files/mismatch1-1.txt
+sample03	4719410.0	5287603.0	Shigella	Shigella sonnei		glpT_E448K,gyrA_D87G,gyrA_S83L,sat2	gs://path/to/table1_files/mismatch2-1.txt	gs://path/to/table2_files/mismatch2-1.txt		
+sample04							gs://path/to/table1_files/mismatch2-1.txt		gs://path/to/table1_files/mismatch1-1.txt	
+sample05										
diff --git a/examples/file_comparison/outputs/filtered_file_comparison_table1.tsv b/examples/file_comparison/outputs/filtered_file_comparison_table1.tsv
new file mode 100644
index 0000000..ad16c0e
--- /dev/null
+++ b/examples/file_comparison/outputs/filtered_file_comparison_table1.tsv
@@ -0,0 +1,6 @@
+samples	amrfinderplus_amr_core_genes	assembly_length	extra_column	file_column	gambit_predicted_taxon	sort_file_column
+sample01	tet(A),aph(6)-Id,aph(3'')-Ib	4783605	extra_value	gs://path/to/table1_files/match1-1.txt	Salmonella enterica	gs://path/to/table1_files/match1-1.txt
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27	5226301		gs://path/to/table1_files/mismatch1-1.txt	Shigella sonnei	gs://path/to/table1_files/mismatch1-1.txt
+sample03		4719410	extra_value	gs://path/to/table1_files/mismatch2-1.txt	Shigella	gs://path/to/table1_files/sortmatch1-1.txt
+sample04	sul1,aadA7,parC_S87L,gyrA_T83I	6674526		gs://path/to/table1_files/mismatch2-1.txt	Pseudomonas aeruginosa	gs://path/to/table1_files/mismatch1-1.txt
+sample05	parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA	2773544			Staphylococcus aureus	
diff --git a/examples/file_comparison/outputs/filtered_file_comparison_table2.tsv b/examples/file_comparison/outputs/filtered_file_comparison_table2.tsv
new file mode 100644
index 0000000..dfc036f
--- /dev/null
+++ b/examples/file_comparison/outputs/filtered_file_comparison_table2.tsv
@@ -0,0 +1,6 @@
+samples	amrfinderplus_amr_core_genes	assembly_length	extra_column	file_column	gambit_predicted_taxon	sort_file_column
+sample01	aph(3'')-Ib,aph(6)-Id,tet(A)	4783610	extra_value	gs://path/to/table2_files/match1-1.txt	Salmonella enterica	gs://path/to/table2_files/match1-1.txt
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1	5274928		gs://path/to/table2_files/mismatch1-1.txt	Shigella sonnei	gs://path/to/table2_files/mismatch1-1.txt
+sample03	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2	5287603		gs://path/to/table2_files/mismatch2-1.txt	Shigella sonnei	gs://path/to/table2_files/sortmatch1-1.txt
+sample04	parC_S87L,gyrA_T83I,sul1,aadA7	6674503	extra_value		Pseudomonas aeruginosa	
+sample05	parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D	2771914			Staphylococcus aureus	
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..c0986f5
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,2 @@
+__VERSION__ = "v0.0.1"
+import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
\ No newline at end of file
diff --git a/tests/table1_files/match1-1.txt b/tests/table1_files/match1-1.txt
new file mode 100644
index 0000000..7bd112e
--- /dev/null
+++ b/tests/table1_files/match1-1.txt
@@ -0,0 +1,3 @@
+foo
+bar
+
diff --git a/tests/table1_files/match1-2.txt b/tests/table1_files/match1-2.txt
new file mode 100644
index 0000000..42f0295
--- /dev/null
+++ b/tests/table1_files/match1-2.txt
@@ -0,0 +1,3 @@
+baz
+eggs
+
diff --git a/tests/table1_files/match1-3.txt b/tests/table1_files/match1-3.txt
new file mode 100644
index 0000000..fe05684
--- /dev/null
+++ b/tests/table1_files/match1-3.txt
@@ -0,0 +1,3 @@
+spam
+monty
+
diff --git a/tests/table1_files/match2-1.txt b/tests/table1_files/match2-1.txt
new file mode 100644
index 0000000..2b70035
--- /dev/null
+++ b/tests/table1_files/match2-1.txt
@@ -0,0 +1,2 @@
+1 2 3
+
diff --git a/tests/table1_files/match2-2.txt b/tests/table1_files/match2-2.txt
new file mode 100644
index 0000000..8db5eef
--- /dev/null
+++ b/tests/table1_files/match2-2.txt
@@ -0,0 +1,2 @@
+4 5 6
+
diff --git a/tests/table1_files/match2-3.txt b/tests/table1_files/match2-3.txt
new file mode 100644
index 0000000..ee64adb
--- /dev/null
+++ b/tests/table1_files/match2-3.txt
@@ -0,0 +1,2 @@
+7 8 9
+
diff --git a/tests/table1_files/mismatch1-1.txt b/tests/table1_files/mismatch1-1.txt
new file mode 100644
index 0000000..7bd112e
--- /dev/null
+++ b/tests/table1_files/mismatch1-1.txt
@@ -0,0 +1,3 @@
+foo
+bar
+
diff --git a/tests/table1_files/mismatch1-2.txt b/tests/table1_files/mismatch1-2.txt
new file mode 100644
index 0000000..75d7bfb
--- /dev/null
+++ b/tests/table1_files/mismatch1-2.txt
@@ -0,0 +1,2 @@
+foo
+
diff --git a/tests/table1_files/mismatch1-3.txt b/tests/table1_files/mismatch1-3.txt
new file mode 100644
index 0000000..d86174f
--- /dev/null
+++ b/tests/table1_files/mismatch1-3.txt
@@ -0,0 +1,4 @@
+
+spam
+eggs
+
diff --git a/tests/table1_files/mismatch2-1.txt b/tests/table1_files/mismatch2-1.txt
new file mode 100644
index 0000000..2b70035
--- /dev/null
+++ b/tests/table1_files/mismatch2-1.txt
@@ -0,0 +1,2 @@
+1 2 3
+
diff --git a/tests/table1_files/mismatch2-2.txt b/tests/table1_files/mismatch2-2.txt
new file mode 100644
index 0000000..a28f8ae
--- /dev/null
+++ b/tests/table1_files/mismatch2-2.txt
@@ -0,0 +1,2 @@
+5 6 6
+
diff --git a/tests/table1_files/mismatch2-3.txt b/tests/table1_files/mismatch2-3.txt
new file mode 100644
index 0000000..ae0e511
--- /dev/null
+++ b/tests/table1_files/mismatch2-3.txt
@@ -0,0 +1,2 @@
+hello, world
+
diff --git a/tests/table1_files/sortmatch1-1.txt b/tests/table1_files/sortmatch1-1.txt
new file mode 100644
index 0000000..86e041d
--- /dev/null
+++ b/tests/table1_files/sortmatch1-1.txt
@@ -0,0 +1,3 @@
+foo
+bar
+baz
diff --git a/tests/table2_files/match1-1.txt b/tests/table2_files/match1-1.txt
new file mode 100644
index 0000000..7bd112e
--- /dev/null
+++ b/tests/table2_files/match1-1.txt
@@ -0,0 +1,3 @@
+foo
+bar
+
diff --git a/tests/table2_files/match1-2.txt b/tests/table2_files/match1-2.txt
new file mode 100644
index 0000000..42f0295
--- /dev/null
+++ b/tests/table2_files/match1-2.txt
@@ -0,0 +1,3 @@
+baz
+eggs
+
diff --git a/tests/table2_files/match1-3.txt b/tests/table2_files/match1-3.txt
new file mode 100644
index 0000000..fe05684
--- /dev/null
+++ b/tests/table2_files/match1-3.txt
@@ -0,0 +1,3 @@
+spam
+monty
+
diff --git a/tests/table2_files/match2-1.txt b/tests/table2_files/match2-1.txt
new file mode 100644
index 0000000..2b70035
--- /dev/null
+++ b/tests/table2_files/match2-1.txt
@@ -0,0 +1,2 @@
+1 2 3
+
diff --git a/tests/table2_files/match2-2.txt b/tests/table2_files/match2-2.txt
new file mode 100644
index 0000000..8db5eef
--- /dev/null
+++ b/tests/table2_files/match2-2.txt
@@ -0,0 +1,2 @@
+4 5 6
+
diff --git a/tests/table2_files/match2-3.txt b/tests/table2_files/match2-3.txt
new file mode 100644
index 0000000..ee64adb
--- /dev/null
+++ b/tests/table2_files/match2-3.txt
@@ -0,0 +1,2 @@
+7 8 9
+
diff --git a/tests/table2_files/mismatch1-1.txt b/tests/table2_files/mismatch1-1.txt
new file mode 100644
index 0000000..34ae2c6
--- /dev/null
+++ b/tests/table2_files/mismatch1-1.txt
@@ -0,0 +1,3 @@
+eggs
+spam
+
diff --git a/tests/table2_files/mismatch1-2.txt b/tests/table2_files/mismatch1-2.txt
new file mode 100644
index 0000000..7cd519a
--- /dev/null
+++ b/tests/table2_files/mismatch1-2.txt
@@ -0,0 +1,3 @@
+foo
+foo
+
diff --git a/tests/table2_files/mismatch1-3.txt b/tests/table2_files/mismatch1-3.txt
new file mode 100644
index 0000000..fbabddf
--- /dev/null
+++ b/tests/table2_files/mismatch1-3.txt
@@ -0,0 +1,3 @@
+spam
+
+eggs
diff --git a/tests/table2_files/mismatch2-1.txt b/tests/table2_files/mismatch2-1.txt
new file mode 100644
index 0000000..8d04f96
--- /dev/null
+++ b/tests/table2_files/mismatch2-1.txt
@@ -0,0 +1 @@
+1 2
diff --git a/tests/table2_files/mismatch2-2.txt b/tests/table2_files/mismatch2-2.txt
new file mode 100644
index 0000000..336a0f9
--- /dev/null
+++ b/tests/table2_files/mismatch2-2.txt
@@ -0,0 +1,2 @@
+4 5 6 
+
diff --git a/tests/table2_files/mismatch2-3.txt b/tests/table2_files/mismatch2-3.txt
new file mode 100644
index 0000000..270c611
--- /dev/null
+++ b/tests/table2_files/mismatch2-3.txt
@@ -0,0 +1 @@
+hello, world!
diff --git a/tests/table2_files/sortmatch1-1.txt b/tests/table2_files/sortmatch1-1.txt
new file mode 100644
index 0000000..4fc6926
--- /dev/null
+++ b/tests/table2_files/sortmatch1-1.txt
@@ -0,0 +1,3 @@
+baz
+foo
+bar
diff --git a/tests/test_validator.py b/tests/test_validator.py
new file mode 100644
index 0000000..453f79f
--- /dev/null
+++ b/tests/test_validator.py
@@ -0,0 +1,437 @@
+# To run these unit tests, run "python3 -m unittest" from the root of the
+# project directory.
+
+from theiavalidate.Validator import Validator
+from theiavalidate.theiavalidate import DEFAULT_NA_VALUES
+
+import numpy as np
+import pandas as pd
+import unittest
+
+
+class MockOptions:
+  """
+  Mock the "options" object that is created in theiavalidate.py. In
+  theiavalidate.py, this object is created from command-line arguments using
+  the argparse package, but here we will simulate this object with a
+  different class to more easily create Validator objects.
+  """
+  def __init__(self):
+    self.table1 = None
+    self.table2 = None
+    self.version = None
+    self.columns_to_compare = []
+    self.validation_criteria = None
+    self.column_translation = None
+    self.output_prefix = None
+    self.na_values = DEFAULT_NA_VALUES
+    self.verbose = False
+    self.debug = False
+
+
+class TestDetermineFileColumns(unittest.TestCase):
+  """
+  Test detecting which columns in the tables correspond to files. If there is at
+  least one URI and no other values except np.nan in both tables, we should
+  treat the column as a "file_column".
+  """
+  def setUp(self):
+    self.validator = Validator(MockOptions())
+
+  def run_determine_file_columns(self, data1, data2):
+    self.validator.table1 = pd.DataFrame(data1)
+    self.validator.table2 = pd.DataFrame(data2)
+    self.validator.determine_file_columns()
+
+  def test_no_file_columns(self):
+    data = {
+      "col1": [1, 2, 3],
+      "col2": ["foo", "bar", "baz"]
+    }
+    self.run_determine_file_columns(data, data)
+    self.assertEqual(len(self.validator.file_columns), 0)
+
+  def test_some_file_columns(self):
+    data1 = {
+      "col1": [1, 2, 3],
+      "col2": ["gs://foo", "gs://bar", "gs://baz"]
+    }
+    data2 = {
+      "col1": [1, 2, 3],
+      "col2": ["gs://eggs", "gs://spam", "gs://monty"]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col2"})
+
+  def test_missing_uri(self):
+    data1 = {
+      "col1": [1, 2, 3],
+      "col2": ["gs://foo", np.nan, "gs://baz"]
+    }
+    data2 = {
+      "col1": [1, 2, 3],
+      "col2": ["gs://eggs", "gs://spam", "gs://monty"]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col2"})
+
+  def test_both_columns_null(self):
+    data1 = {
+      "col1": ["gs://foo", "gs://bar", "gs://baz"],
+      "col2": [np.nan, np.nan, np.nan]
+    }
+    data2 = {
+      "col1": ["gs://eggs", "gs://spam", "gs://monty"],
+      "col2": [np.nan, np.nan, np.nan]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col1"})
+
+  def test_one_column_null(self):
+    data1 = {
+      "col1": ["gs://foo", "gs://bar", "gs://baz"],
+      "col2": ["gs://x", "gs://y", "gs://z"]
+    }
+    data2 = {
+      "col1": ["gs://eggs", "gs://spam", "gs://monty"],
+      "col2": [np.nan, np.nan, np.nan]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col1", "col2"})
+
+  def test_mixed_nulls(self):
+    data1 = {
+      "col1": ["gs://foo", "gs://foo", np.nan],
+      "col2": ["gs://x", "gs://y", np.nan]
+    }
+    data2 = {
+      "col1": ["gs://eggs", np.nan, np.nan],
+      "col2": [np.nan, "gs://b", np.nan]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col1", "col2"})
+
+  def test_one_column_not_null(self):
+    data1 = {
+      "col1": ["gs://foo", "gs://bar", "gs://baz"],
+      "col2": ["gs://x", "gs://y", "gs://z"]
+    }
+    data2 = {
+      "col1": ["gs://eggs", "gs://spam", "gs://monty"],
+      "col2": [1, 2, 3]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col1"})
+
+
+class TestCompareFiles(unittest.TestCase):
+  """
+  Test comparing files (exact match). Identical files or two np.nans
+  should count as an exact match, anything else should count as a mismatch.
+  """
+  SAMPLES_INDEX = ["sample1", "sample2", "sample3"]
+  COLUMNS_INDEX = ["col1", "col2"]
+
+  def setUp(self):
+    self.validator = Validator(MockOptions())
+    self.validator.table1_name = "table1"
+    self.validator.table2_name = "table2"
+    self.validator.table1_files_dir = "tests/table1_files"
+    self.validator.table2_files_dir = "tests/table2_files"
+    self.diff_dir = "/dev/null"  # discard diff files -- NOTE(review): assigned on the TestCase, not self.validator; confirm this is intended
+
+  def create_matching_files_tables(self):
+    df1 = pd.DataFrame({
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"]
+    })
+    df2 = pd.DataFrame({
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"]
+    })
+    for df in [df1, df2]:
+      df.index = self.SAMPLES_INDEX
+    return df1, df2
+
+  def create_mismatching_files_tables(self):
+    df1 = pd.DataFrame({
+      "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
+      "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    df2 = pd.DataFrame({
+      "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
+      "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    for df in [df1, df2]:
+      df.index = self.SAMPLES_INDEX
+    return df1, df2
+  
+  def create_mix_matching_files_tables(self):
+    df1 = pd.DataFrame({
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": ["gs://mismatch2-1.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    df2 = pd.DataFrame({
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": ["gs://mismatch2-1.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    for df in [df1, df2]:
+      df.index = self.SAMPLES_INDEX
+    return df1, df2
+
+  def create_null_files_tables(self):
+    df1 = pd.DataFrame({
+      "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    df2 = pd.DataFrame({
+      "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": ["gs://match2-1.txt", np.nan, np.nan]
+    })
+    for df in [df1, df2]:
+      df.index = self.SAMPLES_INDEX
+    return df1, df2
+  
+  def test_matching_files_exact_matches(self):
+    df1, df2 = self.create_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "col1": [True, True, True],
+      "col2": [True, True, True]
+    })
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected)
+
+  def test_mismatching_files_exact_matches(self):
+    df1, df2 = self.create_mismatching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "col1": [False, False, False],
+      "col2": [False, False, False]
+    })
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected)
+
+  def test_mix_matching_files_exact_matches(self):
+    df1, df2 = self.create_mix_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "col1": [True, True, True],
+      "col2": [False, True, False]
+    })
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected)
+
+  def test_null_files_exact_matches(self):
+    df1, df2 = self.create_null_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "col1": [True, True, True],
+      "col2": [False, False, False]
+    })
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected)
+
+  def test_null_files_number_of_differences(self):
+    df1, df2 = self.create_null_files_tables()
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
+    expected = pd.DataFrame({
+      self.validator.NUM_DIFFERENCES_COL: [0, 3]
+    })
+    expected.index = self.COLUMNS_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
+
+  def test_mismatching_files_number_of_differences(self):
+    df1, df2 = self.create_mismatching_files_tables()
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
+    expected = pd.DataFrame({
+      self.validator.NUM_DIFFERENCES_COL: [3, 3]
+    })
+    expected.index = self.COLUMNS_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
+
+  def test_mix_matching_files_number_of_differences(self):
+    df1, df2 = self.create_mix_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
+    expected = pd.DataFrame({
+      self.validator.NUM_DIFFERENCES_COL: [0, 2]
+    })
+    expected.index = self.COLUMNS_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) 
+
+  def test_null_files_number_of_differences(self):
+    df1, df2 = self.create_null_files_tables()
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
+    expected = pd.DataFrame({
+      self.validator.NUM_DIFFERENCES_COL: [0, 3]
+    })
+    expected.index = self.COLUMNS_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
+
+  def test_matching_files_exact_differences(self):
+    df1, df2 = self.create_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      ("col1", "table1"): [np.nan, np.nan, np.nan],
+      ("col1", "table2"): [np.nan, np.nan, np.nan],
+      ("col2", "table1"): [np.nan, np.nan, np.nan],
+      ("col2", "table2"): [np.nan, np.nan, np.nan]
+    }).astype(object)
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
+
+  def test_mismatching_files_exact_differences(self):
+    df1, df2 = self.create_mismatching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      ("col1", "table1"): ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
+      ("col1", "table2"): ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
+      ("col2", "table1"): ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"],
+      ("col2", "table2"): ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"]
+    }).astype(object)
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
+
+  def test_mix_matching_files_exact_differences(self):
+    df1, df2 = self.create_mix_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      ("col1", "table1"): [np.nan, np.nan, np.nan],
+      ("col1", "table2"): [np.nan, np.nan, np.nan],
+      ("col2", "table1"): ["gs://mismatch2-1.txt", np.nan, "gs://mismatch2-3.txt"],
+      ("col2", "table2"): ["gs://mismatch2-1.txt", np.nan, "gs://mismatch2-3.txt"]
+    }).astype(object)
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
+
+  def test_null_files_exact_differences(self):
+    df1, df2 = self.create_null_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      ("col1", "table1"): [np.nan, np.nan, np.nan],
+      ("col1", "table2"): [np.nan, np.nan, np.nan],
+      ("col2", "table1"): [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"],
+      ("col2", "table2"): ["gs://match2-1.txt", np.nan, np.nan]
+    }).astype(object)
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
+
+class TestValidateFiles(unittest.TestCase):
+  """
+  Test comparing files using the validation criteria. EXACT follows the same
+  logic as compare_files(), SET should treat files as matching if after
+  sorting they are identical, IGNORE should "skip" the files. Other criteria
+  should result in an Exception.
+  """
+  SAMPLES_INDEX = ["sample1", "sample2", "sample3", "sample4", "sample5"]
+  COLUMNS_INDEX = ["exact_col", "set_col", "ignore_col", "float_col"]
+  TABLE1_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan]
+  TABLE2_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan]
+  EXACT_MATCHES_MASK = [True, False, False, False, True]  # per-sample flags preloaded in setUp
+
+  def setUp(self):
+    self.validator = Validator(MockOptions())
+    self.validator.validation_criteria = pd.DataFrame({
+      "exact_col": "EXACT",
+      "set_col": "SET",
+      "ignore_col": "IGNORE",
+      "float_col": 0.1,
+    }, index=["column", "criteria"]
+    )
+
+    # This numeric conversion is done in Validator init method
+    self.validator.validation_criteria = (self.validator.validation_criteria
+      .apply(pd.to_numeric, errors="ignore").convert_dtypes()
+    )
+
+    # assign the same URIs to each column, will test that the validation
+    # results vary depending on the validation criterion
+    self.validator.table1 = pd.DataFrame({
+      "samples": self.SAMPLES_INDEX,
+      "exact_col": self.TABLE1_FILE_URIS,
+      "set_col": self.TABLE1_FILE_URIS,
+      "ignore_col": self.TABLE1_FILE_URIS,
+      "float_col": self.TABLE1_FILE_URIS  # URIs under a numeric criterion: unsupported
+    })
+
+    self.validator.table2 = pd.DataFrame({
+      "samples": self.SAMPLES_INDEX,
+      "exact_col": self.TABLE2_FILE_URIS,
+      "set_col": self.TABLE2_FILE_URIS,
+      "ignore_col": self.TABLE2_FILE_URIS,
+      "float_col": self.TABLE2_FILE_URIS  # URIs under a numeric criterion: unsupported
+    })
+
+    # the exact matches will be identical regardless of validation criteria
+    self.validator.file_exact_matches = pd.DataFrame({
+      "exact_col": self.EXACT_MATCHES_MASK,
+      "set_col": self.EXACT_MATCHES_MASK,
+      "ignore_col": self.EXACT_MATCHES_MASK,
+      "float_col": self.EXACT_MATCHES_MASK
+    })
+    self.validator.file_exact_matches.index = self.SAMPLES_INDEX
+
+    self.validator.file_number_of_differences = pd.DataFrame({
+      self.validator.NUM_DIFFERENCES_COL: [3, 3, 3, 3]
+    })
+    self.validator.file_number_of_differences.index = self.COLUMNS_INDEX
+
+    self.validator.table1_name = "table1"
+    self.validator.table2_name = "table2"
+    self.validator.table1_files_dir = "tests/table1_files"
+    self.validator.table2_files_dir = "tests/table2_files"
+
+    self.validator.validation_table = pd.DataFrame()
+
+  def test_validate_exact(self):
+    column = self.validator.validation_criteria["exact_col"]
+    observed = self.validator.validate_files(column)
+    expected = ("EXACT", 3)  # the count preloaded into file_number_of_differences
+    self.assertEqual(observed, expected)
+
+  def test_validate_ignore(self):
+    column = self.validator.validation_criteria["ignore_col"]
+    observed = self.validator.validate_files(column)
+    expected = ("IGNORE", 0)
+    self.assertEqual(observed, expected)
+
+  def test_validate_set(self):
+    column = self.validator.validation_criteria["set_col"]
+    observed = self.validator.validate_files(column)
+    expected = ("SET", 2)  # sorted file should not count as different
+    self.assertEqual(observed, expected)
+
+  def test_validate_float(self):
+    # numeric (% difference) criteria are not implemented for file columns
+    column = self.validator.validation_criteria["float_col"]
+    self.assertRaises(Exception, self.validator.validate_files, column)  # pass the callable, do not call it
+
+  def test_validation_table(self):
+    for column in ["exact_col", "set_col", "ignore_col"]:
+      column = self.validator.validation_criteria[column]
+      self.validator.validate_files(column)
+
+    # these steps are done in run_validation_checks
+    self.validator.validation_table.set_index(self.validator.table1["samples"], inplace=True)
+    self.validator.validation_table.rename_axis(None, axis="index", inplace=True)
+    self.validator.validation_table.columns = pd.MultiIndex.from_tuples(
+      self.validator.validation_table.columns, names=["Column", "Table"]
+    )
+
+    # exact_col should count sortmatch file as a mismatch, while set_col should
+    # count it as a match.
+    # no column should be generated for ignore_col.
+    expected = pd.DataFrame({
+      ("exact_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan],
+      ("exact_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan],
+      ("set_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", np.nan, np.nan],
+      ("set_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, np.nan, np.nan],
+    })
+    expected.set_index(self.validator.table1["samples"], inplace=True)
+    expected.rename_axis(None, axis="index", inplace=True)
+    expected.columns = pd.MultiIndex.from_tuples(expected.columns, names=["Column", "Table"])
+    pd.testing.assert_frame_equal(self.validator.validation_table, expected)
diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index c0dda71..5c66144 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -1,16 +1,21 @@
 from datetime import date
 from pretty_html_table import build_table
+
+import difflib
+import filecmp
 import logging
 import numpy as np
 import os
 import pandas as pd
 import pdfkit as pdf
+import subprocess
 import sys
 
 class Validator:
   """
   This class runs the parsing module for theiavalidate
   """
+  NUM_DIFFERENCES_COL = "Number of differences (exact match)"
   def __init__(self, options):
     logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr)
     self.logger = logging.getLogger(__name__)
@@ -38,7 +43,18 @@ def __init__(self, options):
     self.validation_criteria = options.validation_criteria
     self.columns_to_compare = options.columns_to_compare
     self.columns_to_compare.append("samples")
-    
+
+    self.file_columns = set()  # columns that contain GCP URIs to files
+    self.table1_files_dir = "table1_files"
+    self.table2_files_dir = "table2_files"
+    self.diff_dir = "file_diffs"
+
+    # DataFrames for holding file comparison results
+    self.file_exact_matches = None
+    self.file_exact_differences = None
+    self.file_number_of_differences = None
+    self.file_validations = None
+
     self.output_prefix = options.output_prefix
     self.na_values = options.na_values
       
@@ -134,35 +150,158 @@ def count_populated_cells(self):
     self.logger.debug("Creating the summary table with the number of populated cells")
     self.summary_output = pd.concat([table1_populated_rows, table2_populated_rows], join="outer", axis=1)
   
+
+  def determine_file_columns(self):
+    """
+    Record in self.file_columns every column holding only GCP URIs (with
+    optional nulls) so file comparisons replace the regular comparisons.
+    """
+    def uri_or_null(series):
+      # element-wise: value starts with "gs://" or is null
+      return series.astype(str).str.startswith("gs://") | series.isnull()
+
+    tables = [self.table1, self.table2]
+
+    # pass 1: collect columns that are all URI/null but not entirely null
+    for table in tables:
+      only_uris = table.apply(uri_or_null).all()
+      has_value = ~table.isnull().all()
+      self.file_columns.update(table.columns[only_uris & has_value].tolist())
+
+    # pass 2: discard any column holding another kind of value in either table
+    for table in tables:
+      only_uris = table.apply(uri_or_null).all()
+      non_file_columns = set(table.columns[~only_uris].tolist())
+      self.file_columns = self.file_columns - non_file_columns
+
   """
   This function performs an exact match and creates and Excel file that contains the exact match differences
   """
   def perform_exact_match(self):
     self.logger.debug("Performing an exact match and removing the sample name column")
+
+    if self.file_columns:
+      # exclude file_columns for string comparison
+      table1 = self.table1.drop(list(self.file_columns), axis=1)
+      table2 = self.table2.drop(list(self.file_columns), axis=1)
+
+      # handle file comparisons separately from strings
+      # TODO: set index to samples column in main table earlier?
+      files_df1 = self.table1.set_index("samples") 
+      files_df2 = self.table2.set_index("samples")
+      files_df1 = files_df1[list(self.file_columns)]
+      files_df2 = files_df2[list(self.file_columns)]
+      self.compare_files(files_df1, files_df2)
+      self.set_file_number_of_differences()
+    else:
+      table1 = self.table1
+      table2 = self.table2
+    
     # count the number of differences using exact string matches
     # temporarily make NaNs null since NaN != NaN for the pd.DataFrame.eq() function
     # also: remove the samplename row
-    number_of_differences = pd.DataFrame((~self.table1.fillna("NULL").astype(str).eq(self.table2.fillna("NULL").astype(str))).sum(), columns = ["Number of differences (exact match)"])
+    number_of_differences = pd.DataFrame((~table1.fillna("NULL").astype(str).eq(table2.fillna("NULL").astype(str))).sum(), columns = [self.NUM_DIFFERENCES_COL])
+
     number_of_differences.drop("samples", axis=0, inplace=True)
+
     
     # add the number of differences to the summary output table
     self.logger.debug("Adding the number of exact match differences to the summary table")
     self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1)
-    
+
+    if self.file_number_of_differences is not None:
+      self.summary_output = self.summary_output.combine_first(self.file_number_of_differences)
+    self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int)
+
+    # ensure number of differences column is the last column
+    self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output.pop(self.NUM_DIFFERENCES_COL)
+
     # get a table of self-other differences
     # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame
     self.logger.debug("Creating a table of self-other differences")
-    exact_differences_table = self.table1.drop("samples", axis=1).compare(self.table2.drop("samples", axis=1), keep_shape=True).set_index(self.table1["samples"])
+    exact_differences_table = table1.drop("samples", axis=1).compare(table2.drop("samples", axis=1), keep_shape=True).set_index(table1["samples"])
     # rename the self and other with the table names
     self.logger.debug("Renaming the self and other to be the table names")
     exact_differences_table.rename(columns={"self": self.table1_name, "other": self.table2_name}, level=-1, inplace=True)
+
+    # add file exact differences
+    exact_differences_table = pd.concat([exact_differences_table, self.file_exact_differences], axis=1)
+
     # replace matching values (NAs) with blanks
     self.logger.debug("Replacing all NA values with blanks")
     exact_differences_table.replace(np.nan, "", inplace=True)
+
     
     self.logger.debug("Writing the self-other differences table to a TSV file")
     exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True)
 
+
+  def compare_files(self, file_df1, file_df2):
+    """
+    Compare the local copies of each pair of files referenced at the same
+    sample and column of the two DataFrames. Sets self.file_exact_matches
+    (booleans) and self.file_exact_differences (the values of every pair
+    except those whose files are identical), and writes a diff file for each
+    pair of files that differ.
+    """
+    table_names = [self.table1_name, self.table2_name]
+    self.file_exact_matches = pd.DataFrame(index=file_df1.index,
+                                           columns=file_df1.columns)
+
+    # laid out like the table from df1.compare(df2) so it can be appended
+    # to the exact differences TSV
+    self.file_exact_differences = pd.DataFrame(
+      index=file_df1.index,
+      columns=pd.MultiIndex.from_product([file_df1.columns, table_names])
+    )
+
+    for col in file_df1.columns:
+      key1, key2 = (col, self.table1_name), (col, self.table2_name)
+      for row in file_df1.index:
+        uri1, uri2 = file_df1.loc[row, col], file_df2.loc[row, col]
+        missing1, missing2 = pd.isnull(uri1), pd.isnull(uri2)
+        if missing1 or missing2:
+          # two nulls count as a match; a URI without a partner does not
+          self.file_exact_matches.loc[row, col] = bool(missing1 and missing2)
+        else:
+          file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://"))
+          file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://"))
+          is_match = filecmp.cmp(file1, file2, shallow=False)
+          self.file_exact_matches.loc[row, col] = is_match
+          if is_match:
+            # identical files leave no URIs in the exact differences table
+            self.file_exact_differences.loc[row, key1] = np.nan
+            self.file_exact_differences.loc[row, key2] = np.nan
+            continue
+          diff_path = os.path.join(self.diff_dir, f"{row}_{col}_diff.txt")
+          self.create_diff(file1, file2, diff_path)
+
+        self.file_exact_differences.loc[row, key1] = uri1
+        self.file_exact_differences.loc[row, key2] = uri2
+    self.file_exact_matches = self.file_exact_matches.astype(bool)
+
+  def set_file_number_of_differences(self):
+    """Count, per file column, the sample pairs not flagged as exact matches."""
+    self.file_number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL])
+    for name, flags in self.file_exact_matches.items():
+      self.file_number_of_differences.loc[name] = flags.dropna().ne(True).sum()
+
+  def create_diff(self, file1, file2, output_path):
+    """Write a unified diff of file1 against file2 to output_path."""
+    with open(file1, "r") as f1, open(file2, "r") as f2:
+      # readlines() keeps each line's newline, so the ---/+++/@@ control lines
+      # need the default "\n" terminator or they fuse into the next line
+      diff = "".join(difflib.unified_diff(
+        f1.readlines(),
+        f2.readlines(),
+        fromfile=file1,
+        tofile=file2,
+      ))
+
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)  # "" when no directory part
+    with open(output_path, "w") as out:
+      out.write(diff)
+
   """
   This function calculates the percent difference between two values
   """
@@ -180,14 +319,17 @@ def percent_difference(self, value1, value2):
   def validate(self, column):
     if column.name in self.table1.columns:
       # check the data type of the validation criteria; based on its type, we can assume the comparison to perform
-      if pd.api.types.is_string_dtype(column) == True: # if a string
+      if column.name in self.file_columns:
+        # handle file validation separately from strings, floats
+        validation_criterion, number_of_differences = self.validate_files(column)
+        return (validation_criterion, number_of_differences)
+      elif pd.api.types.is_string_dtype(column) == True: # if a string
         if column[0] == "EXACT": # count the number of exact match failures/differences
           self.logger.debug("Performing an exact match on column {} and counting the number of differences".format(column.name))
           exact_matches = ~self.table1[column.name].fillna("NULL").eq(self.table2[column.name].fillna("NULL"))
 
           self.validation_table[(column.name, self.table1_name)] = self.table1[column.name].where(exact_matches)
           self.validation_table[(column.name, self.table2_name)] = self.table2[column.name].where(exact_matches)
-
           number_of_differences = exact_matches.sum()
           return ("EXACT", number_of_differences)
         elif column[0] == "IGNORE": # do not check; there are no failures (0)
@@ -223,21 +365,80 @@ def validate(self, column):
     else:
       self.logger.debug("Column {} was not found; indicating np.nan failures".format(column.name))
       return ("COLUMN " + column.name + " NOT FOUND", np.nan)
+
+  def validate_files(self, column):
+    """
+    Apply the column's validation criterion (EXACT, IGNORE, or SET) to a file
+    column and return (criterion, number of differing samples). URIs of
+    differing pairs are stored in self.validation_table; for SET, the lines
+    of each file are sorted before comparing.
+    """
+    name = column.name
+    criterion = column.iloc[0]
+    tables = [(self.table1, self.table1_name), (self.table2, self.table2_name)]
+
+    if criterion == "IGNORE":
+      # nothing is compared or recorded for ignored columns
+      return (criterion, 0)
+
+    if criterion == "EXACT":
+      # exact matches were already determined by compare_files()
+      differs = ~self.file_exact_matches[name]
+      for table, table_name in tables:
+        # align on sample name for masking, then restore positional order
+        by_sample = table.set_index("samples")[name]
+        masked = by_sample.where(differs).reset_index()[name]
+        self.validation_table[(name, table_name)] = masked
+      count = self.file_number_of_differences.loc[name, self.NUM_DIFFERENCES_COL]
+      return (criterion, count)
+
+    if criterion == "SET":
+      # drop the gs:// scheme, then compare each pair with lines sorted
+      pairs = pd.concat([self.table1[name], self.table2[name]], axis=1)
+      pairs = pairs.applymap(
+        lambda x: x.removeprefix("gs://") if pd.notnull(x) else x
+      )
+      matches = pairs.apply(self.compare_sorted_files, axis=1)
+      for table, table_name in tables:
+        self.validation_table[(name, table_name)] = table[name].where(~matches)
+      return (criterion, len(matches) - matches.sum())
+
+    # any other criterion (e.g. a numeric tolerance) is unsupported for files
+    raise Exception("Only EXACT, IGNORE, and SET validation criteria implemented for file columns")
+  
+  def compare_sorted_files(self, row):
+    """
+    Report whether the pair of files named in row (table1's path first, then
+    table2's) contain the same lines once each file's lines are sorted.
+    """
+    path1, path2 = row.iloc[0], row.iloc[1]
+    missing1, missing2 = pd.isnull(path1), pd.isnull(path2)
+
+    if missing1 or missing2:
+      # two nulls count as a match; a null paired with a path does not
+      return bool(missing1 and missing2)
+
+    # both present: resolve each path inside its table's local directory
+    local1 = os.path.join(self.table1_files_dir, path1)
+    local2 = os.path.join(self.table2_files_dir, path2)
+    with open(local1, "r") as handle1, open(local2, "r") as handle2:
+      sorted1 = sorted(handle1.readlines())
+      sorted2 = sorted(handle2.readlines())
+
+    return sorted1 == sorted2
   
   """ 
   This function creates, formats, and runs the validation criteria checks
-  """                                                                
+  """
   def run_validation_checks(self):
       self.validation_table = pd.DataFrame()
       
       self.logger.debug("Performing the validation checks")
       self.summary_output[["Validation Criteria", "Number of samples failing the validation criteria"]] = pd.DataFrame(self.validation_criteria.apply(lambda x: self.validate(x), result_type="expand")).transpose()
-      
       # format the validation criteria differences table
       self.logger.debug("Formatting the validation criteria differences table")
       self.validation_table.set_index(self.table1["samples"], inplace=True)
       self.validation_table.rename_axis(None, axis="index", inplace=True)
-      self.validation_table.transpose()
       
       self.validation_table.columns = pd.MultiIndex.from_tuples(self.validation_table.columns, names=["Column", "Table"])
 
@@ -317,6 +518,18 @@ def compare(self):
     
     self.logger.info("Counting how many cells have values")
     self.count_populated_cells()
+
+    self.logger.info("Determining columns for file comparisons")
+    self.determine_file_columns()
+
+    dir1 = f"{self.table1_files_dir}/"
+    dir2 = f"{self.table2_files_dir}/"
+    os.mkdir(dir1)
+    os.mkdir(dir2)
+
+    self.logger.info("Localizing files to compare...")
+    self.table1[list(self.file_columns)].apply(localize_files, directory=dir1)
+    self.table2[list(self.file_columns)].apply(localize_files, directory=dir2)
     
     self.logger.info("Performing an exact string match")
     self.perform_exact_match()
@@ -329,4 +542,17 @@ def compare(self):
     self.make_pdf_report()
     
     self.logger.info("Done!")
-    
\ No newline at end of file
+
+def localize_files(row, directory):
+  """
+  Copy every gs:// URI found in row into directory with gsutil, mirroring the
+  bucket path locally; values that are not gs:// strings are skipped.
+  """
+  for uri in row:
+    if not (isinstance(uri, str) and uri.startswith("gs://")):
+      continue
+    # one copy per file into a folder mirroring the bucket path: a single bulk
+    # copy would be faster but would clobber files that share a name
+    local_dir = os.path.join(directory, os.path.dirname(uri.removeprefix("gs://")))
+    os.makedirs(local_dir, exist_ok=True)
+    subprocess.run(["gsutil", "-m", "cp", uri, local_dir])
diff --git a/theiavalidate/__init__.py b/theiavalidate/__init__.py
index 9a65ac3..c0986f5 100644
--- a/theiavalidate/__init__.py
+++ b/theiavalidate/__init__.py
@@ -1 +1,2 @@
-__VERSION__ = "v0.0.1"
\ No newline at end of file
+__VERSION__ = "v0.0.1"
+import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
\ No newline at end of file
diff --git a/theiavalidate/theiavalidate.py b/theiavalidate/theiavalidate.py
index bbdc82d..c04ae3b 100644
--- a/theiavalidate/theiavalidate.py
+++ b/theiavalidate/theiavalidate.py
@@ -5,6 +5,11 @@
 from __init__ import __VERSION__
 from Validator import Validator
 
+DEFAULT_NA_VALUES = [
+  '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a',
+  '', '#NA', 'NULL', 'null', 'NaN','-NaN', 'nan', '-nan', 'None'
+]
+
 def main():
   parser = argparse.ArgumentParser(
     description = "This tool compares two tab-delimited files and outputs a report of the differences between the two files.",
@@ -25,8 +30,8 @@ def main():
   parser.add_argument("-o", "--output_prefix", 
                       help="the output file name prefix\ndo not include any spaces", default="theiavalidate", metavar="\b")
   parser.add_argument("-n", "--na_values", 
-                      help="the values that should be considered NA\ndefault values = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None']", 
-                      default= ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None'], metavar="\b", type=int)
+                      help=f"the values that should be considered NA\ndefault values = {DEFAULT_NA_VALUES}", 
+                      default=DEFAULT_NA_VALUES, metavar="\b", type=int)
   parser.add_argument("--verbose", 
                       help="increase stdout verbosity", action="store_true", default=False)
   parser.add_argument("--debug",