From 23d48ea64709e732760e1e6c862bd32230e05a39 Mon Sep 17 00:00:00 2001
From: anujj
Date: Sat, 12 Oct 2024 02:01:54 +0530
Subject: [PATCH] Add TensorRT-Model-Optimizer INT4 AWQ support in onnxruntime
 tools (#22390)

[TensorRT-Model-Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
has an implementation of INT4 AWQ. This change adds support to the onnxruntime
quantization tools for quantizing models with TensorRT-Model-Optimizer. A short
usage sketch follows the patch.
---
 .../quantization/matmul_4bits_quantizer.py | 353 +++++++++++++++++-
 1 file changed, 351 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
index 16ad36c48cc74..91819a2078c77 100644
--- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
+++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
@@ -225,6 +225,252 @@ def __init__(
         self.accuracy_level = accuracy_level
 
 
+class NVAWQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
+    def __init__(
+        self,
+        tokenizer_dir,
+        dataset_name="cnn",
+        cache_dir="./cache",
+        calibration_method="awq_lite",
+    ):
+        """
+        Configuration for the nvidia_awq quantization method.
+
+        Args:
+            tokenizer_dir (str): Path of the tokenizer directory.
+            dataset_name (str): Name of the calibration dataset.
+            cache_dir (str): Directory for caching.
+            calibration_method (str): Calibration method for nvidia_awq ("awq_lite" or "awq_clip").
+        """
+        # Import torch and DataLoader
+        try:
+            import torch
+            from torch.utils.data import DataLoader
+
+            self.torch = torch
+            self.DataLoader = DataLoader
+        except ImportError:
+            print(
+                "Error: The 'torch' library is required but not installed. Please install it using 'pip install torch'."
+            )
+            raise ImportError("torch is not installed. Exiting.") from None
+
+        # Import datasets
+        try:
+            from datasets import load_dataset
+
+            self.load_dataset = load_dataset
+        except ImportError:
+            print(
+                "Error: The 'datasets' library is required but not installed. Please install it using 'pip install datasets'."
+            )
+            raise ImportError("datasets is not installed. Exiting.") from None
+
+        # Import transformers
+        try:
+            from transformers import AutoConfig, AutoTokenizer
+
+            self.AutoConfig = AutoConfig
+            self.AutoTokenizer = AutoTokenizer
+        except ImportError:
+            print(
+                "Error: The 'transformers' library is required but not installed. Please install it using 'pip install transformers'."
+            )
+            raise ImportError("transformers is not installed. Exiting.") from None
Exiting.") from None + + super().__init__( + algorithm="nvidia_awq", + quant_format=quant_format, + op_types_to_quantize=None, # Assuming op_types_to_quantize is handled elsewhere + quant_axes=None, # Assuming quant_axes is handled elsewhere + ) + + # Determine the device + device = self.torch.device("cuda" if self.torch.cuda.is_available() else "cpu") + + calib_inputs = self.get_calib_inputs( + dataset_name=dataset_name, + model_name=tokenizer_dir, + cache_dir=cache_dir, + calib_size=32, + batch_size=1, + block_size=512, + device=device, + use_fp16=True, + use_buffer_share=False, + add_past_kv_inputs=True, + max_calib_rows_to_load=128, + add_position_ids=True, + ) + + self.calibration_data_reader = calib_inputs + self.calibration_method = calibration_method + + def make_model_input( + self, + config, + input_ids_arg, + attention_mask_arg, + add_past_kv_inputs, + device, + use_fp16, + use_buffer_share, + add_position_ids, + ): + # Access torch from the instance variable + torch = self.torch + + input_ids = input_ids_arg + attention_mask = attention_mask_arg + + if isinstance(input_ids_arg, list): + input_ids = torch.tensor(input_ids_arg, device=device, dtype=torch.int64) + attention_mask = torch.tensor(attention_mask_arg, device=device, dtype=torch.int64) + + inputs = { + "input_ids": input_ids.contiguous(), + "attention_mask": attention_mask.contiguous(), + } + + if add_position_ids: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + inputs["position_ids"] = position_ids.contiguous() + + if add_past_kv_inputs: + torch_dtype = torch.float16 if use_fp16 else torch.float32 + batch_size, sequence_length = input_ids.shape + max_sequence_length = config.max_position_embeddings + num_heads, head_size = ( + config.num_key_value_heads, + config.hidden_size // config.num_attention_heads, + ) + for i in range(config.num_hidden_layers): + past_key = torch.zeros( + batch_size, + num_heads, + max_sequence_length if use_buffer_share else 0, + head_size, + device=device, + dtype=torch_dtype, + ) + past_value = torch.zeros( + batch_size, + num_heads, + max_sequence_length if use_buffer_share else 0, + head_size, + device=device, + dtype=torch_dtype, + ) + inputs.update( + { + f"past_key_values.{i}.key": past_key.contiguous(), + f"past_key_values.{i}.value": past_value.contiguous(), + } + ) + + return inputs + + def get_calib_inputs( + self, + dataset_name, + model_name, + cache_dir, + calib_size, + batch_size, + block_size, + device, + use_fp16, + use_buffer_share, + add_past_kv_inputs, + max_calib_rows_to_load, + add_position_ids, + ): + # Access transformers and datasets from the instance variables + auto_config = self.AutoConfig + auto_tokenizer = self.AutoTokenizer + load_dataset = self.load_dataset + + config = auto_config.from_pretrained( + model_name, use_auth_token=True, cache_dir=cache_dir, trust_remote_code=True + ) + tokenizer = auto_tokenizer.from_pretrained( + model_name, use_auth_token=True, cache_dir=cache_dir, trust_remote_code=True + ) + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + tokenizer.pad_token = tokenizer.eos_token + + assert calib_size <= max_calib_rows_to_load, "calib size should be no more than max_calib_rows_to_load" + + if "cnn" in dataset_name: + dataset2 = load_dataset("cnn_dailymail", name="3.0.0", split="train").select(range(max_calib_rows_to_load)) + column = "article" + elif "pile" in dataset_name: + dataset2 = load_dataset("mit-han-lab/pile-val-backup", split="validation") + column = "text" + else: + 
+            raise ValueError(f'dataset "{dataset_name}" not supported')
+
+        dataset2 = dataset2[column][:calib_size]
+        batch_encoded = tokenizer.batch_encode_plus(
+            dataset2, return_tensors="pt", padding=True, truncation=True, max_length=block_size
+        )
+        batch_encoded = batch_encoded.to(device)
+        batch_encoded_input_ids = batch_encoded["input_ids"]
+        batch_encoded_attention_mask = batch_encoded["attention_mask"]
+
+        # Access DataLoader from the instance variable
+        data_loader = self.DataLoader
+
+        calib_dataloader_input_ids = data_loader(batch_encoded_input_ids, batch_size=batch_size, shuffle=False)
+        calib_dataloader_attention_mask = data_loader(
+            batch_encoded_attention_mask, batch_size=batch_size, shuffle=False
+        )
+
+        assert len(calib_dataloader_input_ids.dataset) == len(calib_dataloader_attention_mask.dataset)
+        assert len(calib_dataloader_input_ids) == len(calib_dataloader_attention_mask)
+
+        number_of_batched_samples = calib_size // batch_size
+
+        batched_input_ids = []
+        for idx, data in enumerate(calib_dataloader_input_ids):
+            batched_input_ids.append(data)
+            if idx == (number_of_batched_samples - 1):
+                break
+
+        batched_attention_mask = []
+        for idx, data in enumerate(calib_dataloader_attention_mask):
+            batched_attention_mask.append(data)
+            if idx == (number_of_batched_samples - 1):
+                break
+
+        print(
+            f"\n--Quantize-Script-- number_of_batched_samples={number_of_batched_samples}, "
+            f"batch-input-ids-list-len={len(batched_input_ids)}, batched_attention_mask={len(batched_attention_mask)}\n"
+        )
+
+        batched_inputs_list = []
+        for i in range(number_of_batched_samples):
+            input_ids = batched_input_ids[i]
+            attention_mask = batched_attention_mask[i]
+
+            inputs = self.make_model_input(
+                config,
+                input_ids,
+                attention_mask,
+                add_past_kv_inputs,
+                device,
+                use_fp16,
+                use_buffer_share,
+                add_position_ids,
+            )
+            inputs = {input_name: torch_tensor.cpu().numpy() for input_name, torch_tensor in inputs.items()}
+            batched_inputs_list.append(inputs)
+
+        print(f"\n--Quantize-Script-- number of batched inputs = {len(batched_inputs_list)}\n")
+        return batched_inputs_list
+
+
 def is_divisible(val1, val2):
     return int(val2 * np.ceil(val1 / val2)) == val1
 
@@ -777,6 +1023,49 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeP
         return results
 
 
+class NVAWQWeightOnlyQuantizer:
+    def __init__(
+        self,
+        config: NVAWQWeightOnlyQuantConfig,
+    ):
+        self.config = config
+
+    def quantize_awq(self, model: ModelProto | str) -> ModelProto:
+        """
+        Perform nvidia_awq quantization using ModelOpt's int4 quantize function.
+
+        Args:
+            model (ModelProto): The ONNX model to quantize.
+
+        Returns:
+            ModelProto: The quantized ONNX model.
+        """
+        try:
+            from modelopt.onnx.quantization.int4 import quantize as quantize_int4
+        except ImportError:
+            print(
+                "Please ensure that the 'modelopt' package is installed. Please install it using pip install nvidia_modelopt."
+            )
+            raise ImportError(
+                "modelopt is not installed. Please install it using pip install nvidia_modelopt. Exiting."
+            ) from None
+
+        logger.info("Starting nvidia_awq quantization...")
+
+        # Prepare calibration inputs
+        calib_inputs = self.config.calibration_data_reader
+
+        # Perform quantization using ModelOpt's int4 quantize function
+        quantized_model = quantize_int4(
+            model,
+            calibration_method=self.config.calibration_method,
+            calibration_data_reader=calib_inputs,
+        )
+
+        logger.info("Completed nvidia_awq quantization.")
+        return quantized_model
+
+
 # TODO(fajin): change class name
 class MatMul4BitsQuantizer:
     """
@@ -821,6 +1110,7 @@ def __init__(
         self.nodes_to_exclude = set(nodes_to_exclude)
         self.nodes_to_include = set(nodes_to_include) if nodes_to_include else None
         self.node_quantizer = None
+
         if algo_config is None:
             algo_config = DefaultWeightOnlyQuantConfig(
                 block_size=block_size,
@@ -835,6 +1125,8 @@ def __init__(
             self.node_quantizer = HQQWeightOnlyQuantizer(self.algo_config)
         elif algo_config.algorithm == "DEFAULT":
             self.node_quantizer = DefaultWeightOnlyQuantizer(self.algo_config)
+        elif algo_config.algorithm == "nvidia_awq":
+            self.node_quantizer = NVAWQWeightOnlyQuantizer(self.algo_config)
 
     def _process_subgraph(self, graph_stack: list[GraphProto]):
         new_nodes = []
@@ -966,6 +1258,16 @@ def process(self):
                 self._process_subgraph(graph_stack)
             self.model.clean_initializers()
 
+        elif self.algo_config.algorithm == "nvidia_awq":
+            # Handle nvidia_awq quantization
+            logger.info("Processing nvidia_awq quantization...")
+            self.model = self.node_quantizer.quantize_awq(
+                self.model.model if self.model_path is None else self.model_path
+            )
+            logger.info("Completed nvidia_awq quantization.")
+            self.model = ONNXModel(self.model)  # Ensure the model is wrapped back into ONNXModel
+            self.model.clean_initializers()
+
         else:
             # use Intel® Neural Compressor for RTN or GPTQ weight-only quantize algorithm
             try:
@@ -1012,7 +1314,7 @@ def parse_args():
         "--quant_method",
         default="default",
         type=str,
-        choices=["default", "hqq", "rtn", "gptq"],
+        choices=["default", "hqq", "rtn", "gptq", "nvidia_awq"],
         help="the algorithm used to quantize weight, \nrtn and gptq leverage Intel® Neural Compressor",
     )
     parser.add_argument("--bits", default=4, type=int, help="the target bits to represent weight")
@@ -1076,7 +1378,33 @@ def parse_args():
         "Specify the axis to quantize for an op. Default {MatMul:0, Gather:1}"
         "Example: --quant_axes MatMul:0 Gather:1",
     )
-
+    # Group arguments specific to nvidia_awq
+    nv_awq_config = parser.add_argument_group("nvidia_awq", "Arguments specific to nvidia_awq quantization")
+    nv_awq_config.add_argument(
+        "--calib_dataset_name",
+        type=str,
+        default="cnn",
+        help="Name of the calibration dataset for nvidia_awq.",
+    )
+    nv_awq_config.add_argument(
+        "--tokenizer_dir",
+        type=str,
+        required=False,
+        help="Path of the tokenizer directory.",
+    )
+    nv_awq_config.add_argument(
+        "--calibration_method",
+        type=str,
+        required=False,
+        choices=["awq", "awq_clip"],
+        help="Calibration method for nvidia_awq: 'awq' (AWQ scale search) or 'awq_clip' (weight clipping).",
+    )
+    nv_awq_config.add_argument(
+        "--cache_dir",
+        type=str,
+        default="./cache",
+        help="Cache directory for calibration data.",
+    )
     return parser.parse_args()
 
 
@@ -1117,6 +1445,27 @@ def parse_args():
         quant_config = RTNWeightOnlyQuantConfig(op_types_to_quantize=op_types_to_quantize)
     elif args.quant_method == "gptq":
         quant_config = GPTQWeightOnlyQuantConfig(block_size=args.block_size, op_types_to_quantize=op_types_to_quantize)
+    elif args.quant_method == "nvidia_awq":
+        if quant_format == QuantFormat.QOperator:
+            logger.warning("QOperator is not applicable to nvidia_awq. Overriding the value to QDQ.")
+            quant_format = QuantFormat.QDQ
+
+        model = input_model_path
+        if args.calibration_method is not None:
+            if args.calibration_method == "awq":
+                calibration_method = "awq_lite"
+            else:
+                calibration_method = "awq_clip"
+        else:
+            calibration_method = "awq_lite"
+
+        quant_config = NVAWQWeightOnlyQuantConfig(
+            dataset_name=args.calib_dataset_name,
+            tokenizer_dir=args.tokenizer_dir,
+            cache_dir=args.cache_dir,
+            calibration_method=calibration_method,
+        )
     else:
         raise ValueError(f"Unsupported quantization method: {args.quant_method}")
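Usage sketch (not part of the patch): a minimal example of driving the new nvidia_awq path through the Python API, as referenced in the commit message. The model path, output path, and tokenizer id below are placeholders; the save call uses ONNXModel's existing save_model_to_file helper.

```python
# Minimal sketch, assuming a decoder-style ONNX LLM at "model.onnx" and a
# Hugging Face tokenizer id; both names are placeholders, not part of the patch.
from onnxruntime.quantization.matmul_4bits_quantizer import (
    MatMul4BitsQuantizer,
    NVAWQWeightOnlyQuantConfig,
)

# Building the config downloads the calibration dataset and tokenizer and
# prepares the calibration inputs (requires torch, datasets, transformers).
quant_config = NVAWQWeightOnlyQuantConfig(
    tokenizer_dir="meta-llama/Llama-2-7b-hf",  # placeholder tokenizer/model id
    dataset_name="cnn",                        # calibration dataset: "cnn" or "pile"
    cache_dir="./cache",
    calibration_method="awq_lite",             # or "awq_clip"
)

# Passing a path (rather than a ModelProto) lets quantize_awq read the model directly.
quant = MatMul4BitsQuantizer(
    model="model.onnx",
    algo_config=quant_config,
)
quant.process()
quant.model.save_model_to_file("model.int4_awq.onnx", use_external_data_format=True)
```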
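The same flow from the command line, assuming the script's existing --input_model and --output_model arguments (not shown in this diff); --calibration_method awq maps to awq_lite:

```
python -m onnxruntime.quantization.matmul_4bits_quantizer \
    --input_model model.onnx \
    --output_model model.int4_awq.onnx \
    --quant_method nvidia_awq \
    --tokenizer_dir meta-llama/Llama-2-7b-hf \
    --calib_dataset_name cnn \
    --calibration_method awq \
    --cache_dir ./cache
```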