I am applying HDBSCAN to batches of data. I am concatenating the inputs from each batch into a list and fitting HDBSCAN on the concatenation, which naturally gets slower as I go through more batches.
Hence I thought of a solution: collect the resulting medoids and their labels as I go through the batches so some of them can be reused in later batches, and pop the oldest items from my input list every N iterations.
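To illustrate just the sliding-window part of the idea in isolation, here is a minimal sketch (the helper name `sliding_window_fit` and the exact drop rule are only for illustration, not part of my actual training loop; it assumes each batch is a 2-D NumPy array with enough samples to cluster):

```python
import numpy as np
from sklearn.cluster import HDBSCAN

def sliding_window_fit(batches, ghost_freq=5, ghost_coeff=0.5):
    """Yield (labels_, medoids_) from HDBSCAN fit on a bounded window of batches."""
    model = HDBSCAN(min_cluster_size=5, store_centers='medoid')
    window = []
    for i, batch in enumerate(batches):
        window.append(batch)
        if i > 0 and i % ghost_freq == 0:
            # Drop the oldest batches so the concatenated fit stays bounded
            n_drop = max(1, int(len(window) * ghost_coeff))
            del window[:n_drop]
        model.fit(np.concatenate(window, axis=0))
        yield model.labels_, model.medoids_
```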
Below is a simple version of my original implementation:
```python
import os
import gzip
import pickle
import numpy as np
import torch
from tqdm import tqdm
from sklearn.cluster import HDBSCAN

# Update this with your arguments
args = {'traindir': '/path/to/your/data/folder', 'minclustersize': 5, 'n_jobs': -1,
        'ghost_freq': 5, 'ghost_coeff': 0.5, 'clustersavefreq': 10,
        'batch_size': 32, 'workers': 0}

def train(train_loader):
    resultpath = '/path/to/your/result/folder'  # Update this with your result folder path
    if os.path.exists(os.path.join(resultpath, 'train_hdbscan_Fin.gzip')):
        with gzip.open(os.path.join(resultpath, 'train_hdbscan_Fin.gzip'), 'rb') as f:
            return pickle.load(f)

    train_checkpoint_queue = []
    model = HDBSCAN(min_cluster_size=args['minclustersize'], n_jobs=args['n_jobs'],
                    store_centers='medoid', copy=True)
    input_list, medoids_list = [], []

    for i, (inputs, path) in enumerate(tqdm(train_loader)):
        input_tensors = torch.stack([inp.cuda() for inp in inputs]).detach().cpu().clone().numpy()
        input_list.append(input_tensors)

        if i % args['ghost_freq'] == 0:
            # Note: the number of popped batches grows with i, not with len(input_list)
            items_to_pop = max(1, int(i * args['ghost_coeff']))
            for _ in range(items_to_pop):
                if input_list:
                    input_list.pop(0)

        if input_list:
            input_data = np.concatenate(input_list, axis=0)
            model.fit(input_data)
        else:
            model.fit(input_tensors)

        if model.labels_.size > 0:
            # Labels of the current fit that do not occur among all-but-the-last labels
            new_labels = model.labels_[~np.in1d(model.labels_, model.labels_[:-1])]
            if new_labels.size > 0:
                model.labels_ = np.concatenate((model.labels_, new_labels))

        if model.medoids_.size > 0:
            # Determine the number of rows and columns in model.medoids_
            num_rows, num_cols = model.medoids_.shape
            # Boolean mask of elements not present in all-but-the-last medoid rows,
            # reshaped to match the shape of model.medoids_
            bool_index = ~np.in1d(model.medoids_.ravel(),
                                  model.medoids_[:-1].ravel()).reshape((num_rows, num_cols))
            # new_medoids = model.medoids_[~np.in1d(model.medoids_, model.medoids_[:-1])]
            new_medoids = model.medoids_[bool_index.any(axis=1)]  # keep whole rows
            if new_medoids.size > 0:
                model.medoids_ = np.concatenate((model.medoids_, new_medoids), axis=0)

        if i % args['clustersavefreq'] == 0:
            output_labels_list = model.labels_.tolist()
            unique_labels = list(set(output_labels_list))
            medoids_list = model.medoids_.tolist()
            # Medoid rows are lists (unhashable), so convert them to tuples before set()
            unique_medoids = list(set(map(tuple, medoids_list)))
            with open(resultpath + f'/learn_num_clusters{i}.txt', 'w+', encoding="utf-8") as f:
                print('Unique clusters from the current alive batch:', unique_labels,
                      '\nLength of medoids_list:', len(unique_medoids),
                      '\nSize of input list:', len(input_list), file=f)
            # save_checkpoint({'batch_idx': i, 'input_list': input_list, 'model': model,
            #                  'medoids_list': model.medoids_, 'model_labels': model.labels_,
            #                  'mainlist': mainlist}, filename=f'learn_checkpoint{i}.pth')
            # manage_learn_checkpoints(i, train_checkpoint_queue, resultpath)

# Define the save_checkpoint and manage_learn_checkpoints functions as needed
```
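For reference, here is a minimal sketch of what those two helpers could look like (the gzip file layout and the `max_checkpoints` limit are placeholders I made up, not part of the original code):

```python
import os
import gzip
import pickle

def save_checkpoint(state, filename, resultpath='/path/to/your/result/folder'):
    # Pickle the whole training state; gzip keeps large input lists manageable
    with gzip.open(os.path.join(resultpath, filename + '.gzip'), 'wb') as f:
        pickle.dump(state, f)

def manage_learn_checkpoints(i, checkpoint_queue, resultpath, max_checkpoints=3):
    # Keep only the most recent max_checkpoints files on disk
    checkpoint_queue.append(os.path.join(resultpath, f'learn_checkpoint{i}.pth.gzip'))
    while len(checkpoint_queue) > max_checkpoints:
        oldest = checkpoint_queue.pop(0)
        if os.path.exists(oldest):
            os.remove(oldest)
```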
I am not sure whether this is being done correctly, because when my Nth batch has 419 samples, the labels generated range from -1 to 356, but if the next batch has only 4 samples, the labels range from -1 to 4, even though some of those samples could clearly belong to the 20th label. Or maybe I am misunderstanding how model.labels_ works. Any insights/suggestions for me? Thank you in advance.
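For context, here is a minimal reproduction of the behaviour I am describing: scikit-learn's HDBSCAN assigns labels independently on every fit, as consecutive integers starting at 0 (with -1 for noise), so labels from separate fit calls are not comparable. The nearest-medoid mapping at the end is just my guess at how reuse across fits could work, not anything HDBSCAN itself provides:

```python
import numpy as np
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances_argmin

# Two independent fits: labels_ always restart at 0 (with -1 for noise)
big, _ = make_blobs(n_samples=419, centers=8, random_state=0)
small, _ = make_blobs(n_samples=40, centers=3, random_state=1)

model = HDBSCAN(min_cluster_size=5, store_centers='medoid')
model.fit(big)
old_medoids = model.medoids_.copy()   # remember the first fit's medoids
model.fit(small)                      # this fit knows nothing about the first one

# One way to relate the fits after the fact: map each new cluster's medoid
# to the index of its nearest medoid from the previous fit
mapping = pairwise_distances_argmin(model.medoids_, old_medoids)
print(dict(enumerate(mapping)))       # {new_label: nearest_old_label}
```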