From f17420bf3ccb01ff8a428fa0c1829b35c466b8ff Mon Sep 17 00:00:00 2001
From: Paniz Ojaghi <90856064+Panizghi@users.noreply.github.com>
Date: Sat, 31 Aug 2024 20:28:24 -0400
Subject: [PATCH] Delete src/main/python/safetensors/test.py

---
 src/main/python/safetensors/test.py | 74 -----------------------------
 1 file changed, 74 deletions(-)
 delete mode 100644 src/main/python/safetensors/test.py

diff --git a/src/main/python/safetensors/test.py b/src/main/python/safetensors/test.py
deleted file mode 100644
index eb7dbb2f0..000000000
--- a/src/main/python/safetensors/test.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import json
-import torch
-import os
-import gzip
-from safetensors.torch import save_file, load_file
-
-# Define paths
-input_file_path = "/home/p2ojaghi/anserini/anserini/src/main/python/safetensors/sample_input.jsonl"
-output_directory = "/home/p2ojaghi/anserini/anserini/src/main/python/safetensors"
-
-# Ensure the output directory exists
-if not os.path.exists(output_directory):
-    os.makedirs(output_directory)
-
-# Check if the input file is a .gz file and convert it to .jsonl if necessary
-if input_file_path.endswith('.gz'):
-    with gzip.open(input_file_path, 'rt') as gz_file:
-        jsonl_file_path = input_file_path.replace('.gz', '.jsonl')
-        with open(jsonl_file_path, 'w') as jsonl_file:
-            for line in gz_file:
-                jsonl_file.write(line)
-    input_file_path = jsonl_file_path
-
-# Check if the input file is a .jsonl file
-elif not input_file_path.endswith('.jsonl'):
-    raise ValueError("Input file must be a .jsonl or .gz file")
-
-# Get the base name of the input file for output file names
-base_name = os.path.basename(input_file_path).replace('.jsonl', '')
-
-vectors_path = os.path.join(output_directory, f'{base_name}_vectors.safetensors')
-docids_path = os.path.join(output_directory, f'{base_name}_docids.safetensors')
-
-# Initialize lists to hold data
-vectors = []
-docids = []
-
-# Process the JSONL file to extract vectors and docids
-with open(input_file_path, 'r') as file:
-    for line in file:
-        entry = json.loads(line)
-        # Ensure that the vector starts with a valid number
-        if isinstance(entry['vector'][0], float):
-            vectors.append(entry['vector'])
-            docid = entry['docid']
-            docid_ascii = [ord(char) for char in docid]  # Convert docid to ASCII values
-            docids.append(docid_ascii)
-        else:
-            print(f"Skipped invalid vector entry with docid: {entry['docid']}")
-
-# Convert lists to tensors
-vectors_tensor = torch.tensor(vectors, dtype=torch.float32)  # Use float32 for memory efficiency
-docids_tensor = torch.nn.utils.rnn.pad_sequence([torch.tensor(d, dtype=torch.int64) for d in docids], batch_first=True)
-
-# Debugging: Print out the first few document IDs and vectors
-print("Sample document IDs (ASCII):", docids[:5])
-print("Sample vectors:", vectors[:5])
-
-# Save the tensors to SafeTensors files
-save_file({'vectors': vectors_tensor}, vectors_path)
-save_file({'docids': docids_tensor}, docids_path)
-
-print(f"Saved vectors to {vectors_path}")
-print(f"Saved docids to {docids_path}")
-
-vectors_path = '/home/p2ojaghi/anserini/anserini/src/main/python/safetensors/sample_input_vectors.safetensors'
-docids_path = '/home/p2ojaghi/anserini/anserini/src/main/python/safetensors/sample_input_docids.safetensors'
-
-# Load vectors and docids
-loaded_vectors = load_file(vectors_path)['vectors']
-loaded_docids = load_file(docids_path)['docids']
-
-print(f"Loaded vectors: {loaded_vectors}")
-print(f"Loaded document IDs (ASCII): {loaded_docids}")
\ No newline at end of file