diff --git a/README.md b/README.md
index d5ded729783..a52468cfd7a 100644
--- a/README.md
+++ b/README.md
@@ -165,6 +165,10 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve
 
 ```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
 
+#### CPUs with AVX-512 BFloat16 support
+
+If you have a CPU with AVX-512 BFloat16 instruction set support, you can increase the performance of the KSampler node and decrease memory usage by approximately 50% by running `python main.py --cpu --bf16-vae --bf16-unet`.
+
 # Running
 
 ```python main.py```
diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 2397de3d624..7b10124e943 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -112,6 +112,12 @@ class LatentPreviewMethod(enum.Enum):
 vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
 vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")
 
+class CpuBf16Mode(enum.Enum):
+    Auto = "auto"
+    Yes = "yes"
+    No = "no"
+
+parser.add_argument("--use-cpu-bf16", type=CpuBf16Mode, default=CpuBf16Mode.Auto, help="When CPU mode is enabled, use bf16 instructions to improve performance.", action=EnumAction)
 parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")
 
 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
diff --git a/comfy/model_management.py b/comfy/model_management.py
index fb27470152c..ff50b7932a1 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1,7 +1,7 @@
 import psutil
 import logging
 from enum import Enum
-from comfy.cli_args import args
+from comfy.cli_args import args, CpuBf16Mode
 import torch
 import sys
 import platform
@@ -27,6 +27,7 @@ class CPUState(Enum):
 total_vram = 0
 
 lowvram_available = True
+ipex_available = False
 xpu_available = False
 
 if args.deterministic:
@@ -71,6 +72,21 @@ def is_intel_xpu():
             return True
     return False
 
+def use_cpu_bf16():
+    if args.use_cpu_bf16 == CpuBf16Mode.No or cpu_state != CPUState.CPU:
+        return False
+    if args.use_cpu_bf16 == CpuBf16Mode.Yes:
+        return True
+
+    try:
+        from cpuinfo import get_cpu_info
+    except:
+        logging.warning('py-cpuinfo is not installed, rerun "pip install -r requirements.txt"')
+        return False
+
+    cpu_info = get_cpu_info()
+    return 'avx512_bf16' in cpu_info['flags']
+
 def get_torch_device():
     global directml_enabled
     global cpu_state
@@ -183,10 +199,10 @@ def is_nvidia():
 except:
     pass
 
-if is_intel_xpu():
+if is_intel_xpu() or use_cpu_bf16():
     VAE_DTYPES = [torch.bfloat16] + VAE_DTYPES
 
-if args.cpu_vae:
+if args.cpu_vae and not use_cpu_bf16():
     VAE_DTYPES = [torch.float32]
 
 
@@ -905,9 +921,8 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
     return True
 
 def should_use_bf16(device=None, model_params=0, prioritize_performance=True, manual_cast=False):
-    if device is not None:
-        if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow
-            return False
+    if cpu_state == CPUState.CPU or device is not None and is_device_cpu(device):
+        return use_cpu_bf16()
 
     if device is not None:
         if is_device_mps(device):
diff --git a/requirements.txt b/requirements.txt
index 4c2c0b2b221..a68f60bd509 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,6 +13,7 @@ Pillow
 scipy
 tqdm
 psutil
+py-cpuinfo
 
 #non essential dependencies:
 kornia>=0.7.1
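For reference, here is a minimal standalone sketch of the detection path that `use_cpu_bf16()` follows when `--use-cpu-bf16` is left at `auto`: query py-cpuinfo for the `avx512_bf16` CPU flag and only then prefer bf16 over fp32. The helper name `cpu_supports_bf16` is illustrative and not part of the patch; it assumes py-cpuinfo (added to requirements.txt above) and torch are installed.

```python
# Illustrative sketch only, not part of the patch: how the "auto" mode decides
# whether bf16 is worth enabling on the CPU.
import torch
from cpuinfo import get_cpu_info

def cpu_supports_bf16() -> bool:
    # py-cpuinfo exposes CPU feature flags as a list of lowercase strings;
    # AVX-512 BF16 support shows up as 'avx512_bf16'.
    return 'avx512_bf16' in get_cpu_info().get('flags', [])

# Pick the CPU dtype the same way the patch does: bf16 when supported, else fp32.
dtype = torch.bfloat16 if cpu_supports_bf16() else torch.float32
print(f"CPU dtype: {dtype}")
```

On a machine with AVX-512 BF16 this prints `torch.bfloat16`, which mirrors what `should_use_bf16()` now returns in CPU mode.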