Allow bf16 computations on CPUs with BF16 support
Modern CPUs have native AVX512 BF16 instructions, which significantly improve
matmul and conv2d operations.

With bfloat16 instructions, UNet steps are 40-50% faster on both AMD and Intel CPUs.
There are minor visible changes with bf16, but no avalanche effects, so this feature
is enabled by default with the new `--use-cpu-bf16=auto` option.
It can be disabled with `--use-cpu-bf16=no`.

Signed-off-by: Sv. Lockal <[email protected]>
AngryLoki committed Aug 6, 2024
1 parent e545a63 commit 88f3f92
Showing 4 changed files with 32 additions and 6 deletions.
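To make the matmul claim in the commit message concrete, here is a rough, hypothetical timing sketch (not part of the commit; the matrix sizes, repeat count, and exact speedup are illustrative and will vary by machine) comparing a float32 and a bfloat16 matmul on CPU. On CPUs with AVX512 BF16, PyTorch's CPU backend can dispatch the bf16 case to native kernels, which is where the speedup comes from.

```python
# Rough CPU matmul timing sketch: fp32 vs bf16 (illustrative only).
import time
import torch

def bench(x, y, repeats=10):
    # Average wall-clock time of x @ y over a few repeats.
    start = time.perf_counter()
    for _ in range(repeats):
        _ = x @ y
    return (time.perf_counter() - start) / repeats

a = torch.randn(2048, 2048)
b = torch.randn(2048, 2048)

fp32_time = bench(a, b)
bf16_time = bench(a.to(torch.bfloat16), b.to(torch.bfloat16))
print(f"fp32: {fp32_time * 1e3:.1f} ms  bf16: {bf16_time * 1e3:.1f} ms")
```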
4 changes: 4 additions & 0 deletions README.md
@@ -165,6 +165,10 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve

```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```

#### CPUs with AVX-512 BFloat16 support

If you have a CPU with AVX-512 BFloat16 instruction set support, you can increase the performance of the KSampler node and decrease memory usage by approximately 50% by running `python main.py --cpu --bf16-vae --bf16-unet`.

# Running

```python main.py```
6 changes: 6 additions & 0 deletions comfy/cli_args.py
@@ -112,6 +112,12 @@ class LatentPreviewMethod(enum.Enum):
vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")

class CpuBf16Mode(enum.Enum):
    Auto = "auto"
    Yes = "yes"
    No = "no"

parser.add_argument("--use-cpu-bf16", type=CpuBf16Mode, default=CpuBf16Mode.Auto, help="When CPU mode is enabled use bf16 instructions to improve performance.", action=EnumAction)
parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")

parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
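The option above follows the pattern of an argparse flag whose string value is converted into an `enum.Enum`. The standalone sketch below is illustrative only: the `Mode` class and `choices=list(Mode)` stand in for ComfyUI's actual `CpuBf16Mode` and `EnumAction` machinery.

```python
# Minimal enum-backed CLI flag (hypothetical stand-in for EnumAction).
import argparse
import enum

class Mode(enum.Enum):
    Auto = "auto"
    Yes = "yes"
    No = "no"

parser = argparse.ArgumentParser()
# argparse calls Mode("auto"/"yes"/"no"), i.e. lookup by enum value.
parser.add_argument("--use-cpu-bf16", type=Mode, default=Mode.Auto, choices=list(Mode),
                    help="When CPU mode is enabled use bf16 instructions to improve performance.")

args = parser.parse_args(["--use-cpu-bf16", "no"])
print(args.use_cpu_bf16)              # Mode.No
print(args.use_cpu_bf16 is Mode.No)   # True
```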
27 changes: 21 additions & 6 deletions comfy/model_management.py
@@ -1,7 +1,7 @@
import psutil
import logging
from enum import Enum
from comfy.cli_args import args
from comfy.cli_args import args, CpuBf16Mode
import torch
import sys
import platform
@@ -27,6 +27,7 @@ class CPUState(Enum):
total_vram = 0

lowvram_available = True
ipex_available = False
xpu_available = False

if args.deterministic:
@@ -71,6 +72,21 @@ def is_intel_xpu():
            return True
    return False

def use_cpu_bf16():
    if args.use_cpu_bf16 == CpuBf16Mode.No or cpu_state != CPUState.CPU:
        return False
    if args.use_cpu_bf16 == CpuBf16Mode.Yes:
        return True

    try:
        from cpuinfo import get_cpu_info
    except:
        logging.warning('py-cpuinfo is not installed, rerun "pip install -r requirements.txt"')
        return False

    cpu_info = get_cpu_info()
    return 'avx512_bf16' in cpu_info['flags']

def get_torch_device():
    global directml_enabled
    global cpu_state
@@ -183,10 +199,10 @@ def is_nvidia():
except:
    pass

if is_intel_xpu():
if is_intel_xpu() or use_cpu_bf16():
    VAE_DTYPES = [torch.bfloat16] + VAE_DTYPES

if args.cpu_vae:
if args.cpu_vae and not use_cpu_bf16():
    VAE_DTYPES = [torch.float32]


@@ -905,9 +921,8 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
    return True

def should_use_bf16(device=None, model_params=0, prioritize_performance=True, manual_cast=False):
    if device is not None:
        if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow
            return False
    if cpu_state == CPUState.CPU or device is not None and is_device_cpu(device):
        return use_cpu_bf16()

    if device is not None:
        if is_device_mps(device):
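To check ahead of time whether a machine will take the `avx512_bf16` path that `use_cpu_bf16()` probes for, a small standalone check (a sketch only, using the py-cpuinfo dependency added to requirements.txt below) might look like this:

```python
# Quick check for the AVX512-BF16 CPU flag via py-cpuinfo (sketch only).
from cpuinfo import get_cpu_info

flags = get_cpu_info().get("flags", [])
print("avx512_bf16 supported:", "avx512_bf16" in flags)
```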
1 change: 1 addition & 0 deletions requirements.txt
@@ -13,6 +13,7 @@ Pillow
scipy
tqdm
psutil
py-cpuinfo

#non essential dependencies:
kornia>=0.7.1
