Allow bf16 computations on CPUs with BF16 support
Modern CPUs have native AVX512 BF16 instructions, which significantly improve
matmul and conv2d operations.

With bfloat16 instructions, UNet steps are 40-50% faster on both AMD and Intel CPUs.
There are minor visible changes with bf16, but no avalanche effects, so this feature
is enabled by default with the new `--use-cpu-bf16=auto` option.
It can be disabled with `--use-cpu-bf16=no`.

Signed-off-by: Sv. Lockal <[email protected]>
AngryLoki committed Aug 6, 2024
1 parent e545a63 commit 88f3f92
Showing 4 changed files with 32 additions and 6 deletions.
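To make the matmul claim in the commit message concrete, here is a rough, hypothetical timing sketch (not part of the commit; the matrix sizes, repeat count, and exact speedup are illustrative and will vary by machine) comparing a float32 and a bfloat16 matmul on CPU. On CPUs with AVX512 BF16, PyTorch's CPU backend can dispatch the bf16 case to native kernels, which is where the speedup comes from.

```python
# Rough CPU matmul timing sketch: fp32 vs bf16 (illustrative only).
import time
import torch

def bench(x, y, repeats=10):
    # Average wall-clock time of x @ y over a few repeats.
    start = time.perf_counter()
    for _ in range(repeats):
        _ = x @ y
    return (time.perf_counter() - start) / repeats

a = torch.randn(2048, 2048)
b = torch.randn(2048, 2048)

fp32_time = bench(a, b)
bf16_time = bench(a.to(torch.bfloat16), b.to(torch.bfloat16))
print(f"fp32: {fp32_time * 1e3:.1f} ms  bf16: {bf16_time * 1e3:.1f} ms")
```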
4 changes: 4 additions & 0 deletions README.md
@@ -165,6 +165,10 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve

```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```

#### CPUs with AVX-512 BFloat16 support

If you have a CPU with AVX-512 BFloat16 instruction set support, you can increase the performance of the KSampler node and decrease memory usage by approximately 50% by running `python main.py --cpu --bf16-vae --bf16-unet`.

# Running

```python main.py```
6 changes: 6 additions & 0 deletions comfy/cli_args.py
@@ -112,6 +112,12 @@ class LatentPreviewMethod(enum.Enum):
vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")

class CpuBf16Mode(enum.Enum):
    Auto = "auto"
    Yes = "yes"
    No = "no"

parser.add_argument("--use-cpu-bf16", type=CpuBf16Mode, default=CpuBf16Mode.Auto, help="When CPU mode is enabled use bf16 instructions to improve performance.", action=EnumAction)
parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")

parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
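The option above follows the pattern of an argparse flag whose string value is converted into an `enum.Enum`. The standalone sketch below is illustrative only: the `Mode` class and `choices=list(Mode)` stand in for ComfyUI's actual `CpuBf16Mode` and `EnumAction` machinery.

```python
# Minimal enum-backed CLI flag (hypothetical stand-in for EnumAction).
import argparse
import enum

class Mode(enum.Enum):
    Auto = "auto"
    Yes = "yes"
    No = "no"

parser = argparse.ArgumentParser()
# argparse calls Mode("auto"/"yes"/"no"), i.e. lookup by enum value.
parser.add_argument("--use-cpu-bf16", type=Mode, default=Mode.Auto, choices=list(Mode),
                    help="When CPU mode is enabled use bf16 instructions to improve performance.")

args = parser.parse_args(["--use-cpu-bf16", "no"])
print(args.use_cpu_bf16)              # Mode.No
print(args.use_cpu_bf16 is Mode.No)   # True
```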
27 changes: 21 additions & 6 deletions comfy/model_management.py
@@ -1,7 +1,7 @@
import psutil
import logging
from enum import Enum
from comfy.cli_args import args
from comfy.cli_args import args, CpuBf16Mode
import torch
import sys
import platform
@@ -27,6 +27,7 @@ class CPUState(Enum):
total_vram = 0

lowvram_available = True
ipex_available = False
xpu_available = False

if args.deterministic:
@@ -71,6 +72,21 @@ def is_intel_xpu():
            return True
    return False

def use_cpu_bf16():
    if args.use_cpu_bf16 == CpuBf16Mode.No or cpu_state != CPUState.CPU:
        return False
    if args.use_cpu_bf16 == CpuBf16Mode.Yes:
        return True

    try:
        from cpuinfo import get_cpu_info
    except:
        logging.warning('py-cpuinfo is not installed, rerun "pip install -r requirements.txt"')
        return False

    cpu_info = get_cpu_info()
    return 'avx512_bf16' in cpu_info['flags']

def get_torch_device():
    global directml_enabled
    global cpu_state
@@ -183,10 +199,10 @@ def is_nvidia():
except:
    pass

if is_intel_xpu():
if is_intel_xpu() or use_cpu_bf16():
    VAE_DTYPES = [torch.bfloat16] + VAE_DTYPES

if args.cpu_vae:
if args.cpu_vae and not use_cpu_bf16():
    VAE_DTYPES = [torch.float32]


@@ -905,9 +921,8 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
    return True

def should_use_bf16(device=None, model_params=0, prioritize_performance=True, manual_cast=False):
    if device is not None:
        if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow
            return False
    if cpu_state == CPUState.CPU or device is not None and is_device_cpu(device):
        return use_cpu_bf16()

    if device is not None:
        if is_device_mps(device):
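To check ahead of time whether a machine will take the `avx512_bf16` path that `use_cpu_bf16()` probes for, a small standalone check (a sketch only, using the py-cpuinfo dependency added to requirements.txt below) might look like this:

```python
# Quick check for the AVX512-BF16 CPU flag via py-cpuinfo (sketch only).
from cpuinfo import get_cpu_info

flags = get_cpu_info().get("flags", [])
print("avx512_bf16 supported:", "avx512_bf16" in flags)
```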
1 change: 1 addition & 0 deletions requirements.txt
@@ -13,6 +13,7 @@ Pillow
scipy
tqdm
psutil
py-cpuinfo

#non essential dependencies:
kornia>=0.7.1
