-
Notifications
You must be signed in to change notification settings - Fork 462
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
446 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import argparse | ||
import csv | ||
import os | ||
|
||
import torch | ||
from PIL import Image | ||
from tqdm import tqdm | ||
from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor | ||
|
||
|
||
def benchmark_model(model_name, benchmark_dir, device='cuda'): | ||
# model_path = '/mnt/petrelfs/share_data/wangwenhai/llm/internvl_14b_224px' | ||
model_path = 'OpenGVLab/InternVL-14B-224px' | ||
model = AutoModel.from_pretrained( | ||
model_path, | ||
torch_dtype=torch.float16, | ||
low_cpu_mem_usage=True, | ||
trust_remote_code=True).cuda().eval() | ||
preprocess = CLIPImageProcessor.from_pretrained(model_path) | ||
tokenizer = AutoTokenizer.from_pretrained( | ||
model_path, use_fast=False, add_eos_token=True) | ||
tokenizer.pad_token_id = 0 # set pad_token_id to 0 | ||
image_dir = os.path.join(benchmark_dir, 'MLLM_VLM Images') | ||
csv_file = os.path.join(benchmark_dir, 'Questions.csv') | ||
|
||
csv_outfile = open('output.csv', 'w', newline='') | ||
csv_writer = csv.writer(csv_outfile) | ||
csv_writer.writerow(['qid1', 'qid2', 'pred1', 'pred2', 'gt1', 'gt2', 'q1score', 'q2score']) # header | ||
|
||
categories = [ | ||
'Orientation and Direction', 'Presence of Specific Features', | ||
'State and Condition', 'Quantity and Count', | ||
'Positional and Relational Context', 'Color and Appearance', | ||
'Structural Characteristics', 'Texts', | ||
'Viewpoint and Perspective' | ||
] | ||
|
||
pair_accuracies = {category: 0 for category in categories} | ||
num_pairs = 0 | ||
|
||
with open(csv_file, 'r') as f: | ||
reader = csv.reader(f) | ||
next(reader) # skip header | ||
for i, row in tqdm(enumerate(reader)): | ||
qid1, qtype1, statement1 = row | ||
|
||
# Get next row for the pair | ||
row = next(reader, None) | ||
if not row: | ||
break | ||
qid2, qtype2, statement2 = row | ||
|
||
qid1, qid2 = int(qid1), int(qid2) | ||
|
||
img1 = Image.open(os.path.join(image_dir, qtype1, f'{qid1}.jpg')) | ||
img1 = img1.resize((224, 224)) | ||
img2 = Image.open(os.path.join(image_dir, qtype1, f'{qid2}.jpg')) | ||
img2 = img2.resize((224, 224)) | ||
|
||
prefix = 'summarize:' | ||
# text1 = prefix + 'a photo of ' + statement1 | ||
# text2 = prefix + 'a photo of ' + statement2 | ||
text1 = prefix + statement1 | ||
text2 = prefix + statement2 | ||
|
||
text1 = tokenizer(text1, return_tensors='pt', max_length=80, | ||
truncation=True, padding='max_length').input_ids.cuda() | ||
text2 = tokenizer(text2, return_tensors='pt', max_length=80, | ||
truncation=True, padding='max_length').input_ids.cuda() | ||
|
||
img1 = preprocess(images=img1, return_tensors='pt').pixel_values.to(torch.float16).cuda() | ||
img2 = preprocess(images=img2, return_tensors='pt').pixel_values.to(torch.float16).cuda() | ||
imgs = torch.cat((img1, img2), dim=0) | ||
|
||
with torch.no_grad(): | ||
logits_per_image1, logits_per_text1 = model(image=imgs, text=text1, mode=model_name) | ||
logits_per_image2, logits_per_text2 = model(image=imgs, text=text2, mode=model_name) | ||
|
||
probs1 = logits_per_text1.float().softmax(dim=-1).cpu().numpy() | ||
probs2 = logits_per_text2.float().softmax(dim=-1).cpu().numpy() | ||
|
||
img1_score1 = probs1[0][0] | ||
img1_score2 = probs2[0][0] | ||
|
||
pred1 = 'img1' if img1_score1 > 0.5 else 'img2' | ||
pred2 = 'img1' if img1_score2 > 0.5 else 'img2' | ||
|
||
gt1 = 'img1' if qid1 % 2 == 1 else 'img2' | ||
gt2 = 'img1' if qid2 % 2 == 1 else 'img2' | ||
|
||
csv_writer.writerow([qid1, qid2, pred1, pred2, gt1, gt2, img1_score1, img1_score2]) | ||
|
||
current_category = categories[num_pairs // 15] | ||
if pred1 == gt1 and pred2 == gt2: | ||
pair_accuracies[current_category] += 1 | ||
num_pairs += 1 | ||
|
||
csv_outfile.close() | ||
|
||
# Calculate percentage accuracies | ||
for category in pair_accuracies: | ||
pair_accuracies[category] = (pair_accuracies[category] / (num_pairs // len(categories))) * 100 | ||
|
||
return pair_accuracies | ||
|
||
|
||
parser = argparse.ArgumentParser(description='Process a directory path.') | ||
|
||
# Adding an argument for the directory path | ||
parser.add_argument('--directory', type=str, help='The path to the directory') | ||
|
||
# Parsing the arguments | ||
args = parser.parse_args() | ||
|
||
# InternVL models | ||
models = ['InternVL-C', 'InternVL-G'] | ||
|
||
results = {f'{model}': benchmark_model(model, args.directory) for model in models} | ||
|
||
print(results) | ||
|
||
# Convert results to format suitable for star plot | ||
categories = results[list(results.keys())[0]].keys() | ||
print(f'categories: {categories}') | ||
data = {'Categories': list(categories)} | ||
print(f'data: {data}') | ||
for model in list(results.keys()): | ||
data[model] = [results[model][category] for category in categories] | ||
print(f'data: {data}') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.