Ctorch #318

Open. Exusial wants to merge 47 commits into master from ctorch.

Commits (47)
9226e9d  add support for adapting pytorch's c++ codes. (Exusial, Apr 28, 2022)
282fc15  update ctorch. (Exusial, Apr 28, 2022)
3770a5d  Merge branch 'master' into ctorch (Exusial, Apr 28, 2022)
5e4f24e  fix multiple definitions. (Exusial, Apr 30, 2022)
ad4ee50  fix linspace zero division. (Exusial, Sep 12, 2022)
20da64e  Add interpolate area support. (Exusial, Sep 15, 2022)
2b394c5  Merge branch 'master' into ctorch (Exusial, Sep 15, 2022)
1531e1f  Merge branch 'debug' into ctorch (Exusial, Sep 15, 2022)
72b73fa  update ctorch with ArrayRef. (Exusial, Sep 18, 2022)
cd68088  add documentation and tests. (Exusial, Sep 18, 2022)
d7ac1eb  pass nvdiffrec renderutils. (Exusial, Sep 20, 2022)
46737e0  Merge branch 'master' into debug (Exusial, Sep 20, 2022)
941941c  fix batched matmul. (Exusial, Sep 20, 2022)
917e956  Merge branch 'debug' into ctorch (Exusial, Sep 20, 2022)
a8f07be  add broadcast support for matmul and function backward. (Exusial, Sep 21, 2022)
6ed2602  update ldflags. (Exusial, Sep 21, 2022)
d8801ae  add atan2. (Exusial, Sep 22, 2022)
4dff6e3  fix cuda arch. (Exusial, Sep 22, 2022)
5e78237  fix arrayref. (Exusial, Sep 23, 2022)
34e7dd7  delete log. (Exusial, Sep 29, 2022)
0c632eb  add int64 support. (Exusial, Sep 30, 2022)
409945e  fix int64 sum. (Exusial, Oct 5, 2022)
385c917  fix None. (Exusial, Oct 14, 2022)
f8ad90b  Merge branch 'master' into ctorch (Exusial, Oct 21, 2022)
f59119f  fix cuda. (Exusial, Oct 21, 2022)
8d9e725  fix cuda. (Exusial, Oct 21, 2022)
b29fb87  fix cuda. (Exusial, Oct 21, 2022)
34215c5  update. (Exusial, Oct 21, 2022)
8a3b50a  update dtype. (Exusial, Oct 21, 2022)
62431c9  update dtype. (Exusial, Oct 22, 2022)
60a551d  update header. (Exusial, Oct 22, 2022)
61a501c  fix type. (Exusial, Oct 26, 2022)
653147c  add ~. (Exusial, Oct 27, 2022)
cb483cf  fix release bug. (Exusial, Oct 28, 2022)
9a99f6c  fix memory. (Exusial, Oct 31, 2022)
30be2dc  add ninja build support. (Exusial, Nov 3, 2022)
46f4ac3  fix memroy leak. (Exusial, Nov 15, 2022)
40db8bd  merge. (Exusial, Nov 26, 2022)
cd0121f  Merge branch 'master' into ctorch (Feb 6, 2023)
fe522ad  update compatibilty. (Feb 7, 2023)
a474e4a  update ctorch stream process. (Feb 23, 2023)
f8cb69a  update compiler. (Exusial, Feb 23, 2023)
3144e1e  update for optix. (Feb 27, 2023)
7f30726  set up. (Exusial, Feb 27, 2023)
672c056  Merge branch 'ctorch' of github.com:Exusial/jittor into ctorch (Exusial, Feb 27, 2023)
2853904  add ctorch verbose mode. (Exusial, Feb 28, 2023)
e760279  fix load pytorch. (Exusial, Mar 1, 2023)
Changes from all commits
37 changes: 35 additions & 2 deletions python/jittor/__init__.py
@@ -1703,10 +1703,20 @@ def grad(self, grad0, grad1):
        assert db.data == 0

    '''
    def __init__(self):
        super().__init__(self)
        self.saved_tensors = []
        self.saved_variables = self.saved_tensors

    def save_for_backward(self, *args):
        self.saved_tensors = list(args)
        self.saved_variables = self.saved_tensors

    def __call__(self, *args):
        backup = args
        args = list(args)
        taped_inputs = []
        self.taped_inputs_shape = []
        taped_outputs = []
        input_mask = [-1] * len(args)
        for i,v in enumerate(args):
@@ -1719,6 +1729,7 @@ def __call__(self, *args):
                input_mask[i] = len(taped_inputs)
                args[i] = v
                taped_inputs.append(v)
                self.taped_inputs_shape.append(v.shape)
        ori_res = self.execute(*args)
        if not isinstance(ori_res, Sequence):
            res = [ori_res]
@@ -1730,12 +1741,14 @@ def __call__(self, *args):
                v = v.tape()
                output_mask[i] = len(taped_outputs)
                res[i] = v
                if not v.is_stop_grad():
                    taped_outputs.append(v)
        self.input_mask = input_mask
        self.output_mask = output_mask
        # tape output and input together so
        # backward treats them as one operator
        if len(taped_outputs) > 0:
            tape_together(taped_inputs, taped_outputs, self._grad)
        if isinstance(ori_res, Sequence):
            return res
        else:
@@ -1747,13 +1760,33 @@ def _grad(self, *args):
        if not isinstance(ret, Sequence):
            ret = (ret,)
        new_ret = []
        # print("function grad: ", self.__class__, len(ret), len(self.input_mask), self.input_mask)
        for i, r in enumerate(ret):
            j = self.input_mask[i]
            if j<0:
                # -2 in input_mask represents it is stop_grad
                assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
                    "because the input value is not a jittor variable."
            else:
                # detect how many trailing input dims the grad already matches
                if r is not None:
                    input_shape = self.taped_inputs_shape[j]
                    same_idx = -1
                    for i in range(1, len(input_shape) + 1):
                        if input_shape[-i] == r.shape[-i]:
                            same_idx = i
                        else:
                            break
                    # print(same_idx, input_shape, r.shape)
                    if same_idx == -1:
                        r = r.reshape(input_shape[0], -1).sum(1)
                    elif same_idx < len(input_shape):
                        r_shape_prod = np.prod(r.shape[:-same_idx])
                        input_shape_prod = np.prod(input_shape[:-same_idx])
                        r = r.reshape(input_shape[:-same_idx] + [ori_int(r_shape_prod // input_shape_prod)] + input_shape[-same_idx:]).sum(len(input_shape) - same_idx)
                    elif same_idx == len(input_shape) and len(r.shape) > len(input_shape):
                        while len(r.shape) > len(input_shape):
                            r = r.sum(0)
                new_ret.append(r)
        return new_ret

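A usage sketch (not part of the diff) of the two `__init__.py` changes above: `save_for_backward` stores tensors for the backward pass, and the new `_grad` logic reduces a broadcast-shaped gradient back to the recorded input shape. `MyMul` and the shapes are illustrative assumptions, not code from the PR:

    import jittor as jt
    from jittor import Function

    class MyMul(Function):
        def execute(self, x, w):
            # New PyTorch-style API added by this PR.
            self.save_for_backward(x, w)
            return x * w                  # w broadcasts from [3] to [2, 3]

        def grad(self, grad_output):
            x, w = self.saved_tensors
            # Both grads come back in the broadcast shape [2, 3]; the new
            # _grad reduction sums the leading axis so gw matches w's [3].
            return grad_output * w, grad_output * x

    x = jt.random([2, 3])
    w = jt.random([3])
    y = MyMul()(x, w)
    gx, gw = jt.grad(y.sum(), [x, w])
    assert list(gx.shape) == [2, 3] and list(gw.shape) == [3]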
288 changes: 288 additions & 0 deletions python/jittor/compiler.py
@@ -18,6 +18,7 @@
from ctypes import cdll
from ctypes.util import find_library

import jittor as jt
import jittor_utils as jit_utils
from jittor_utils import LOG, run_cmd, find_exe, cc_path, cc_type, cache_path
from . import pyjt_compiler
@@ -866,6 +867,7 @@ def check_cuda():
    else:
        cc_flags += f" -lcudart -L\"{cuda_lib}\" "
        # ctypes.CDLL(cuda_lib+"/libcudart.so", import_flags)
        print("cudart: ", cuda_lib+"/libcudart.so")
        ctypes.CDLL(cuda_lib+"/libcudart.so", dlopen_flags)
        is_cuda = has_cuda = 1

@@ -1401,3 +1403,289 @@ def func(x):
flags.has_pybt = has_pybt

core.set_lock_path(lock.lock_path)

def _is_cuda_file(path: str) -> bool:
    valid_ext = ['.cu', '.cuh']
    return os.path.splitext(path)[1] in valid_ext

def object_file_path(source_file, with_cuda=True) -> str:
    # '/path/to/file.cpp' -> 'file'
    file_name = os.path.splitext(os.path.basename(source_file))[0]
    if _is_cuda_file(source_file) and with_cuda:
        # Use a different object filename in case a C++ and CUDA file have
        # the same filename but different extension (.cpp vs. .cu).
        target = f'{file_name}.cuda.o'
    else:
        target = f'{file_name}.o'
    return target
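For illustration (the paths are hypothetical), the mapping keeps C++ and CUDA objects distinct even when the file stems collide:

    assert object_file_path("/path/to/op.cpp") == "op.o"
    assert object_file_path("/path/to/op.cu") == "op.cuda.o"
    assert object_file_path("/path/to/op.cu", with_cuda=False) == "op.o"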

def _write_ninja_file(path,
                      compiler,
                      nvcc,
                      cflags,
                      post_cflags,
                      cuda_cflags,
                      cuda_post_cflags,
                      sources,
                      objects,
                      ldflags,
                      library_target,
                      with_cuda) -> None:
    r"""Write a ninja file that does the desired compiling and linking.

    `path`: Where to write this file
    `cflags`: list of flags to pass to $cxx. Can be None.
    `post_cflags`: list of flags to append to the $cxx invocation. Can be None.
    `cuda_cflags`: list of flags to pass to $nvcc. Can be None.
    `cuda_post_cflags`: list of flags to append to the $nvcc invocation. Can be None.
    `sources`: list of paths to source files
    `objects`: list of desired paths to objects, one per source.
    `ldflags`: list of flags to pass to linker. Can be None.
    `library_target`: Name of the output library. Can be None; in that case,
        we do no linking.
    `with_cuda`: If we should be compiling with CUDA.
    """
    def sanitize_flags(flags):
        if flags is None:
            return []
        else:
            return [flag.strip() for flag in flags]

    cflags = sanitize_flags(cflags)
    post_cflags = sanitize_flags(post_cflags)
    cuda_cflags = sanitize_flags(cuda_cflags)
    cuda_post_cflags = sanitize_flags(cuda_post_cflags)
    ldflags = sanitize_flags(ldflags)

    # Sanity checks...
    assert len(sources) == len(objects)
    assert len(sources) > 0

    # Version 1.3 is required for the `deps` directive.
    config = ['ninja_required_version = 1.3']
    config.append(f'cxx = {compiler}')
    if with_cuda:
        # if IS_HIP_EXTENSION:
        #     nvcc = _join_rocm_home('bin', 'hipcc')
        # nvcc = _join_cuda_home('bin', 'nvcc')
        config.append(f'nvcc = {nvcc}')

    # if IS_HIP_EXTENSION:
    #     post_cflags = COMMON_HIP_FLAGS + post_cflags
    flags = [f'cflags = {" ".join(cflags)}']
    flags.append(f'post_cflags = {" ".join(post_cflags)}')
    if with_cuda:
        flags.append(f'cuda_cflags = {" ".join(cuda_cflags)}')
        flags.append(f'cuda_post_cflags = {" ".join(cuda_post_cflags)}')
    flags.append(f'ldflags = {" ".join(ldflags)}')

    # Turn into absolute paths so we can emit them into the ninja build
    # file wherever it is.
    sources = [os.path.abspath(file) for file in sources]

    # See https://ninja-build.org/build.ninja.html for reference.
    compile_rule = ['rule compile']
    # if IS_WINDOWS:
    #     compile_rule.append(
    #         ' command = cl /showIncludes $cflags -c $in /Fo$out $post_cflags')
    #     compile_rule.append(' deps = msvc')
    # else:
    compile_rule.append(
        ' command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags')
    compile_rule.append(' depfile = $out.d')
    compile_rule.append(' deps = gcc')

    if with_cuda:
        cuda_compile_rule = ['rule cuda_compile']
        nvcc_gendeps = ' -MMD -MF $out.d'
        # --generate-dependencies-with-compile was added in CUDA 10.2.
        # Compilation will work on earlier CUDA versions but header file
        # dependencies are not correctly computed.
        # required_cuda_version = packaging.version.parse('10.2')
        # has_cuda_version = torch.version.cuda is not None
        # if has_cuda_version and packaging.version.parse(torch.version.cuda) >= required_cuda_version:
        cuda_compile_rule.append(' depfile = $out.d')
        cuda_compile_rule.append(' deps = gcc')
        # Note: non-system deps with nvcc are only supported
        # on Linux so use --generate-dependencies-with-compile
        # to make this work on Windows too.
        if sys.platform == "win32":
            nvcc_gendeps = '--generate-dependencies-with-compile --dependency-output $out.d'
        cuda_compile_rule.append(
            f' command = $nvcc {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags')

    # Emit one build rule per source to enable incremental build.
    build = []
    for source_file, object_file in zip(sources, objects):
        is_cuda_source = _is_cuda_file(source_file) and with_cuda
        rule = 'cuda_compile' if is_cuda_source else 'compile'
        # if IS_WINDOWS:
        #     source_file = source_file.replace(':', '$:')
        #     object_file = object_file.replace(':', '$:')
        source_file = source_file.replace(" ", "$ ")
        object_file = object_file.replace(" ", "$ ")
        build.append(f'build {object_file}: {rule} {source_file}')

    if library_target is not None:
        link_rule = ['rule link']
        # if IS_WINDOWS:
        #     cl_paths = subprocess.check_output(['where',
        #         'cl']).decode(*SUBPROCESS_DECODE_ARGS).split('\r\n')
        #     if len(cl_paths) >= 1:
        #         cl_path = os.path.dirname(cl_paths[0]).replace(':', '$:')
        #     else:
        #         raise RuntimeError("MSVC is required to load C++ extensions")
        #     link_rule.append(f' command = "{cl_path}/link.exe" $in /nologo $ldflags /out:$out')
        # else:
        link_rule.append(' command = $cxx $in $ldflags -shared -o $out')

        link = [f'build {library_target}: link {" ".join(objects)}']

        default = [f'default {library_target}']
    else:
        link_rule, link, default = [], [], []

    # 'Blocks' should be separated by newlines, for visual benefit.
    blocks = [config, flags, compile_rule]
    if with_cuda:
        blocks.append(cuda_compile_rule)
    blocks += [link_rule, build, link, default]
    with open(path, 'w') as build_file:
        for block in blocks:
            lines = '\n'.join(block)
            build_file.write(f'{lines}\n\n')
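A minimal sketch of what `_write_ninja_file` emits for a single C++ source; the paths and flags are illustrative assumptions, and the CUDA rule is omitted because `with_cuda=False`:

    _write_ninja_file("build.ninja",
        compiler="g++", nvcc="nvcc",
        cflags=["-O3", "-fPIC"], post_cflags=[],
        cuda_cflags=[], cuda_post_cflags=[],
        sources=["demo.cpp"], objects=["demo.o"],
        ldflags=["-shared"], library_target="demo.so", with_cuda=False)
    # Resulting build.ninja, roughly (blocks in the order they are emitted:
    # config, flags, compile rule, link rule, builds, link, default):
    #   ninja_required_version = 1.3
    #   cxx = g++
    #
    #   cflags = -O3 -fPIC
    #   post_cflags =
    #   ldflags = -shared
    #
    #   rule compile
    #     command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags
    #     depfile = $out.d
    #     deps = gcc
    #
    #   rule link
    #     command = $cxx $in $ldflags -shared -o $out
    #
    #   build demo.o: compile /abs/path/of/demo.cpp
    #
    #   build demo.so: link demo.o
    #
    #   default demo.so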

def _run_ninja_build(build_directory, verbose=False, error_prefix="") -> None:
    SUBPROCESS_DECODE_ARGS = ()
    command = ['ninja', '-v']
    num_workers = 8
    if num_workers is not None:
        command.extend(['-j', str(num_workers)])
    env = os.environ.copy()
    # os.system(f"cd {build_directory} && {' '.join(command)}")
    try:
        sys.stdout.flush()
        sys.stderr.flush()
        # Warning: don't pass stdout=None to subprocess.run to get output.
        # subprocess.run assumes that sys.__stdout__ has not been modified and
        # attempts to write to it by default. However, when we call _run_ninja_build
        # from ahead-of-time cpp extensions, the following happens:
        # 1) If the stdout encoding is not utf-8, setuptools detaches __stdout__.
        #    https://github.com/pypa/setuptools/blob/7e97def47723303fafabe48b22168bbc11bb4821/setuptools/dist.py#L1110
        #    (it probably shouldn't do this)
        # 2) subprocess.run (on POSIX, with no stdout override) relies on
        #    __stdout__ not being detached:
        #    https://github.com/python/cpython/blob/c352e6c7446c894b13643f538db312092b351789/Lib/subprocess.py#L1214
        # To work around this, we pass in the fileno directly and hope that
        # it is valid.
        stdout_fileno = 1
        sp.run(
            command,
            stdout=stdout_fileno if verbose else sp.PIPE,
            stderr=sp.STDOUT,
            cwd=build_directory,
            check=True,
            env=env)
    except sp.CalledProcessError as e:
        # Python 2 and 3 compatible way of getting the error object.
        _, error, _ = sys.exc_info()
        # error.output contains the stdout and stderr of the build attempt.
        message = error_prefix
        # `error` is a CalledProcessError (which has an `output` attribute), but
        # mypy thinks it's Optional[BaseException] and doesn't narrow
        if hasattr(error, 'output') and error.output:  # type: ignore[union-attr]
            message += f": {error.output.decode(*SUBPROCESS_DECODE_ARGS)}"  # type: ignore[union-attr]
        raise RuntimeError(message) from e
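Usage sketch, assuming a build directory that already contains a build.ninja written by `_write_ninja_file` (the path is illustrative): this runs `ninja -v -j 8` in that directory and raises `RuntimeError` with the captured output on failure.

    _run_ninja_build("/tmp/my_ext_build", verbose=True,
                     error_prefix="Error building extension 'my_ext'")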

def compile_torch_extensions(extension_name, sources, extra_cflags=None, extra_cuda_cflags=None, extra_ldflags=None, use_cuda=0, force_compile=0, verbose=False):
    # Normalize the optional flag lists; with the None defaults the
    # concatenations below would otherwise raise a TypeError.
    extra_cflags = extra_cflags or []
    extra_cuda_cflags = extra_cuda_cflags or []
    extra_ldflags = extra_ldflags or []
    if not use_cuda:
        use_cuda = all(map(_is_cuda_file, sources))
    if use_cuda:
        compiler = jt.flags.nvcc_path
    else:
        compiler = jt.flags.cc_path
    suffix = os.popen("python3-config --extension-suffix").read().replace("\n", "")
    if os.path.exists(extension_name+suffix) and not force_compile:
        return 0
    jittor_src_path = os.path.join(jittor_path, "src")
    assert isinstance(sources, (str, list)), "sources must be a list of source files or a concatenated string"
    # add ctorch files.
    sources += [f"{jittor_src_path}/ctorch/torch/extension.cpp", f"{jittor_src_path}/ctorch/ATen/cuda/CUDAUtils.cpp"]
    objects = [object_file_path(fn) for fn in sources]
    extra_flags = extra_cflags + extra_ldflags if use_cuda == 0 else extra_cuda_cflags + extra_ldflags
    extra_flags = " ".join(extra_flags)
    # test write ninja files
    jittor_c_include = f" -I{jittor_src_path} -I{jittor_src_path}/ctorch -I{jt.flags.jittor_path}/extern/cuda/inc/ "
    pybind_include = os.popen("python3 -m pybind11 --includes").read().strip()
    cflags = [jittor_c_include + " -I/usr/local/cuda/include ",
        "$$(python3 -m pybind11 --includes)",
        "-g",
        f"-DTORCH_EXTENSION_NAME={extension_name}",
        "-O3",
        "-DHAS_CUDA",
        "-shared",
        "-std=c++17",
        "-fPIC"
    ]
    cflags += extra_cflags
    post_cflags = []
    cuda_cflags = [jittor_c_include,
        "-g",
        f"-DTORCH_EXTENSION_NAME={extension_name}",
        "-O3",
        "-shared",
        "-std=c++17",
        "-fPIC",
        "--forward-unknown-to-host-compiler --use_fast_math --expt-relaxed-constexpr",
        "-DHAS_CUDA",
        "--allow-unsupported-compiler",
        f"-gencode arch=compute_{flags.cuda_archs[0]},code=compute_{flags.cuda_archs[0]}"
    ]
    cuda_post_cflags = []
    cuda_cflags += extra_cuda_cflags
    if verbose:
        cuda_cflags += ["-keep"]
    python_ldflags = [x if "config" not in x else "" for x in os.popen("python3-config --ldflags").read().split(" ")]
    ldflags = python_ldflags + ["-lgcc"]
    # add jittor lib:
    core_path = jt.flags.cache_path
    util_path = os.path.sep.join(core_path.split(os.path.sep)[:-1])
    ldflags += [f' -L"{core_path}" -L"{util_path}" -Wl,-rpath "{core_path}" -l:{"jittor_core"+extension_suffix} -Wl,-rpath "{util_path}" -l:{"jit_utils_core"+extension_suffix}']
    # add cuda lib:
    if use_cuda:
        cuda_path = os.path.sep.join(nvcc_path.split(os.path.sep)[:-2]+["lib64"])
        ldflags += [f'-L{cuda_path} -Wl,-rpath "{cuda_path}" -lcusparse']
    ldflags += extra_ldflags
    pybind_suffix = os.popen("python3-config --extension-suffix").read().strip()
    library_target = f"{extension_name}{pybind_suffix}"
    path = os.path.join(jt.flags.cache_path, "build_cache", extension_name)
    if not os.path.exists(path):
        os.makedirs(path)
    sys.path.append(path)
    # print(jt.flags.cache_path)
    _write_ninja_file(os.path.join(path, "build.ninja"),
        jt.flags.cc_path,
        jt.flags.nvcc_path,
        cflags,
        post_cflags,
        cuda_cflags,
        cuda_post_cflags,
        sources,
        objects,
        ldflags,
        library_target,
        use_cuda)
    _run_ninja_build(path, verbose=False, error_prefix=f"Error building extension '{extension_name}'")
    # directly use nvcc or gcc to compile (kept for reference):
    # if not isinstance(sources, str):
    #     sources = " ".join(sources)
    # compile_command = f"{compiler} {sources} -I{jittor_src_path} -I{jittor_src_path}/ctorch -g -DTORCH_EXTENSION_NAME={extension_name} -O3 -shared -std=c++17 --forward-unknown-to-host-compiler --use_fast_math --expt-relaxed-constexpr -fPIC -DHAS_CUDA -gencode arch=compute_{flags.cuda_archs[0]},code=compute_{flags.cuda_archs[0]} -lcusparse {extra_flags} -I{jittor_path}/extern/cuda/inc/ --allow-unsupported-compiler $(python3 -m pybind11 --includes) -o {extension_name}$(python3-config --extension-suffix)"
    # status = os.system(compile_command)
    # print(compile_command)
    # if status != 0:
    #     print("=========\nCompile failed. If you are compiling CUDA ops, please set use_cuda to 1 in the parameters.\n=========")
    # return status
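A hypothetical end-to-end call, mirroring the `torch.utils.cpp_extension.load` workflow; the source name `my_op.cu` and module name `my_ext` are assumptions for illustration. With `use_cuda=1` the sources are compiled by nvcc against the bundled ctorch headers, linked into `my_ext<extension-suffix>` under the build cache, and the build directory is appended to `sys.path`, so the module can then be imported directly:

    from jittor.compiler import compile_torch_extensions

    compile_torch_extensions("my_ext", ["my_op.cu"],
                             extra_cflags=[], extra_cuda_cflags=[],
                             extra_ldflags=[], use_cuda=1, force_compile=1)
    import my_ext   # resolved via the build directory added to sys.path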

def _get_build_directory(extension_name, empty=False):
    return os.path.join(cache_path, extension_name)