Skip to content

Commit

Permalink
reddit conditional
Browse files (browse the repository at this point in the history)
  • Loading branch information
XiaohanZhangCMU committed May 24, 2024
1 parent 796300b commit 88838fa
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions tests/test_streaming_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ def get_dataset(name: str,
},
'reddit_table': {
'local': f'/tmp/test_random_reddit_table_05May1029',
- 'remote': 'SELECT text, added FROM main.reddit.data',
- 'num_samples': 378156152,
+ # 'remote': 'SELECT text, added FROM main.reddit.data WHERE id<1000000 AND id >100000',
+ 'remote': "SELECT text, added FROM main.reddit.data WHERE created <= \'2023-03-01\' AND created >= \'2023-01-01\'",
+ 'num_samples': 9979302, # 113949, # 378156152,
'class': StreamingDataset,
'kwargs': {
'cluster_id': "0523-224100-tid6mais"
Expand Down Expand Up @@ -125,9 +126,9 @@ def test_streaming_remote_dataset(name: str, split: str) -> None:
#if __name__ == "__main__":
# test_streaming_remote_dataset(name = 'refinedweb', split=None)
# test_streaming_remote_dataset(name = 'dummy_table', split=None)
- test_streaming_remote_dataset(name = 'random_cpt_table', split=None)
+ #test_streaming_remote_dataset(name = 'random_cpt_table', split=None)
# test_streaming_remote_dataset(name = 'random_large_table', split=None)
- # test_streaming_remote_dataset(name = 'reddit_table', split=None)
+ test_streaming_remote_dataset(name = 'reddit_table', split=None)
# test_streaming_remote_dataset(name = 'debug_local', split=None)


0 comments on commit 88838fa

Please sign in to comment.