diff --git a/.clang-format b/.clang-format index 48b2c678..f007ce43 100644 --- a/.clang-format +++ b/.clang-format @@ -1,192 +1,3 @@ --- -Language: Cpp -# BasedOnStyle: LLVM -AccessModifierOffset: -2 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveMacros: None -AlignConsecutiveAssignments: None -AlignConsecutiveBitFields: None -AlignConsecutiveDeclarations: None -AlignEscapedNewlines: Right -AlignOperands: Align -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortEnumsOnASingleLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: MultiLine -AttributeMacros: - - __capability -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: Never - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeConceptDeclarations: true -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -QualifierAlignment: Leave -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DeriveLineEnding: true -DerivePointerAlignment: false -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -ExperimentalAutoDetectBinPacking: false -PackConstructorInitializers: BinPack -BasedOnStyle: '' -ConstructorInitializerAllOnOneLineOrOnePerLine: false -AllowAllConstructorInitializersOnNextLine: true -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IfMacros: - - KJ_IF_MAYBE -IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '^(<|"(gtest|gmock|isl|json)/)' - Priority: 3 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' - Priority: 1 - SortPriority: 0 - CaseSensitive: false -IncludeIsMainRegex: '(Test)?$' -IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseLabels: false -IndentCaseBlocks: false -IndentGotoLabels: true -IndentPPDirectives: None -IndentExternBlock: AfterExternBlock -IndentRequires: false -IndentWidth: 2 -IndentWrappedFunctionNames: false -InsertTrailingCommas: None -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: true -LambdaBodyIndentation: Signature -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Auto -ObjCBlockIndentWidth: 2 -ObjCBreakBeforeNestedBlockParam: true -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 60 -PenaltyIndentedWhitespace: 0 -PointerAlignment: Right -PPIndentWidth: -1 -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -SeparateDefinitionBlocks: Leave -ShortNamespaceLines: 1 -SortIncludes: CaseSensitive -SortJavaStaticImport: Before -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeParensOptions: - AfterControlStatements: true - AfterForeachMacros: true - AfterFunctionDefinitionName: false - AfterFunctionDeclarationName: false - AfterIfMacros: true - AfterOverloadedOperator: false - BeforeNonEmptyParentheses: false -SpaceAroundPointerQualifiers: Default -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: Never -SpacesInConditionalStatement: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParentheses: false -SpacesInSquareBrackets: false -SpaceBeforeSquareBrackets: false -BitFieldColonSpacing: Both -Standard: Latest -StatementAttributeLikeMacros: - - Q_EMIT -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseCRLF: false -UseTab: Never -WhitespaceSensitiveMacros: - - STRINGIZE - - PP_STRINGIZE - - BOOST_PP_STRINGIZE - - NS_SWIFT_NAME - - CF_SWIFT_NAME +BasedOnStyle: LLVM ... - diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index f4003558..e3666317 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -42,3 +42,37 @@ jobs: -DLLVM_DIR=/usr/lib/llvm-${{ matrix.llvm_version }}/cmake \ .. make + llvm_compatibility_latest_llvm: + runs-on: ubuntu-22.04 + strategy: + matrix: + llvm_version: [16, 17] + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: Add LLVM project deb repository + uses: myci-actions/add-deb-repo@11 + with: + repo: deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${{ matrix.llvm_version }} main + repo-name: llvm + update: false + keys-asc: https://apt.llvm.org/llvm-snapshot.gpg.key + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y \ + llvm-${{ matrix.llvm_version }}-dev \ + libz3-dev \ + python2 + - name: Build SymCC with the QSYM backend + run: | + mkdir build + cd build + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DZ3_TRUST_SYSTEM_VERSION=ON \ + -DQSYM_BACKEND=ON \ + -DLLVM_DIR=/usr/lib/llvm-${{ matrix.llvm_version }}/cmake \ + .. + make diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e5f9309..8ce44bb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,8 +82,8 @@ find_package(LLVM REQUIRED CONFIG) message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") message(STATUS "Using LLVMConfig.cmake from ${LLVM_DIR}") -if (${LLVM_VERSION_MAJOR} LESS 8 OR ${LLVM_VERSION_MAJOR} GREATER 16) - message(WARNING "The software has been developed for LLVM 8 through 16; \ +if (${LLVM_VERSION_MAJOR} LESS 8 OR ${LLVM_VERSION_MAJOR} GREATER 17) + message(WARNING "The software has been developed for LLVM 8 through 17; \ it is unlikely to work with other versions!") endif() diff --git a/Dockerfile b/Dockerfile index 1d695ac0..5d93d01d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,19 +15,19 @@ # # The base image # -FROM ubuntu:20.04 AS builder +FROM ubuntu:22.04 AS builder # Install dependencies RUN apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y \ cargo \ - clang-10 \ + clang-15 \ cmake \ g++ \ git \ libz3-dev \ - llvm-10-dev \ - llvm-10-tools \ + llvm-15-dev \ + llvm-15-tools \ ninja-build \ python2 \ python3-pip \ @@ -42,7 +42,7 @@ RUN git clone -b v2.56b https://github.com/google/AFL.git afl \ # Download the LLVM sources already so that we don't need to get them again when # SymCC changes -RUN git clone -b llvmorg-10.0.1 --depth 1 https://github.com/llvm/llvm-project.git /llvm_source +RUN git clone -b llvmorg-15.0.0 --depth 1 https://github.com/llvm/llvm-project.git /llvm_source # Build a version of SymCC with the simple backend to compile libc++ COPY . /symcc_source @@ -105,14 +105,14 @@ RUN cmake -G Ninja \ # # The final image # -FROM ubuntu:20.04 +FROM ubuntu:22.04 RUN apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y \ build-essential \ - clang-10 \ + clang-15 \ g++ \ - libllvm10 \ + libllvm15 \ zlib1g \ sudo \ && rm -rf /var/lib/apt/lists/* \ @@ -127,8 +127,8 @@ COPY --from=builder_qsym /afl /afl ENV PATH /symcc_build:$PATH ENV AFL_PATH /afl -ENV AFL_CC clang-10 -ENV AFL_CXX clang++-10 +ENV AFL_CC clang-15 +ENV AFL_CXX clang++-15 ENV SYMCC_LIBCXX_PATH=/libcxx_symcc_install USER ubuntu diff --git a/README.md b/README.md index a15eea18..9f4ac734 100644 --- a/README.md +++ b/README.md @@ -9,14 +9,14 @@ of PRs, we will try to merge them when possible. # SymCC: efficient compiler-based symbolic execution -SymCC is a compiler wrapper which embeds symbolic execution into the program +SymCC is a compiler pass which embeds symbolic execution into the program during compilation, and an associated run-time support library. In essence, the compiler inserts code that computes symbolic expressions for each value in the program. The actual computation happens through calls to the support library at run time. To build the pass and the support library, install LLVM (any version between 8 -and 16) and Z3 (version 4.5 or later), as well as a C++ compiler with support +and 17) and Z3 (version 4.5 or later), as well as a C++ compiler with support for C++17. LLVM lit is only needed to run the tests; if it's not packaged with your LLVM, you can get it with `pip install lit`. diff --git a/compiler/Main.cpp b/compiler/Main.cpp index 0cf97501..ce43af74 100644 --- a/compiler/Main.cpp +++ b/compiler/Main.cpp @@ -13,7 +13,9 @@ // SymCC. If not, see . #include +#if LLVM_VERSION_MAJOR <= 15 #include +#endif #include #include diff --git a/docs/32-bit.txt b/docs/32-bit.txt index f5f6dda8..eae0e9cb 100644 --- a/docs/32-bit.txt +++ b/docs/32-bit.txt @@ -26,12 +26,12 @@ install the 32-bit version of Z3 in your system. Once the dependencies for 32-bit SymCC are available (as well as the 64-bit dependencies mentioned in the main README), configure and build SymCC as usual but add "-DTARGET_32BIT=ON" to the CMake invocation. If the build system doesn't -find your 32-bit versions of LLVM and Z3, specify their locations with +find your 32-bit versions of LLVM and Z3, and specify their locations with "-DLLVM_32BIT_DIR=/some/path" and "-DZ3_32BIT_DIR=/some/other/path", respectively - analogously to how you would hint at the 64-bit versions. The resulting "symcc" and "sym++" scripts work like regular SymCC, but they -additionally understand the "-m32" switch, which tells clang to build 32-bit +additionally understand the "-m32" switch, which tells Clang to build 32-bit artifacts. If you build anything with "-m32", SymCC will make sure that the 32-bit version of the symbolic backend is linked to it instead of the 64-bit variant that would normally be used. Note that, in order to compile C++ code diff --git a/docs/Backends.txt b/docs/Backends.txt index 358412b1..ec3e53ff 100644 --- a/docs/Backends.txt +++ b/docs/Backends.txt @@ -13,7 +13,7 @@ used. Also, we always link against "libSymRuntime.so", so the choice of backend is deferred until run time. From the target program's point of view, the only requirement on a backend is that it be a shared library with the expected name that implements the interface defined in runtime/RuntimeCommon.h (with type -"SymExpr" defined to be something of pointer width). +"SymExpr" is defined to be something of pointer width). Depending on the build option QSYM_BACKEND we build either our own backend or parts of QSYM (which are pulled in via a git submodule) and a small translation diff --git a/docs/Concreteness.txt b/docs/Concreteness.txt index 91eba7a2..26d0e9a1 100644 --- a/docs/Concreteness.txt +++ b/docs/Concreteness.txt @@ -18,9 +18,9 @@ There are two stages at which data can be identified as concrete: If we detect in the compiler pass that a value is a compile-time constant (case 1 above), we do not emit code for symbolic handling at all. However, for any -other type of data we need to generate code that handles the case of it being +other type of data, we need to generate code that handles the case of it being symbolic at run time. Concretely (no pun intended), we mark concrete values at -run time by setting its corresponding symbolic expression in shadow memory to +run time by setting their corresponding symbolic expression in shadow memory to null. This makes it very cheap to check concreteness during execution: just run a null check on the symbolic expression. diff --git a/docs/Configuration.txt b/docs/Configuration.txt index f36d7605..e1239151 100644 --- a/docs/Configuration.txt +++ b/docs/Configuration.txt @@ -9,7 +9,7 @@ SymCC is configured at two different stages: compilation time and the set of dependencies. This is done via arguments to CMake. -2. When you run programs that have been compiled with SymCC, environment +2. When you run programs that have been compiled with SymCC, the environment variables control various aspects of the execution and analysis. We list all available options for each stage in turn. @@ -57,7 +57,7 @@ environment variables. - SYMCC_OUTPUT_DIR (default "/tmp/output"): This is the directory where SymCC will store new inputs (QSYM backend only). If you prefer to handle them - programatically, make your program call symcc_set_test_case_handler; the + programmatically, make your program call symcc_set_test_case_handler; the handler will be called instead of the default handler each time the backend generates a new input. @@ -79,12 +79,15 @@ environment variables. repeatedly (QSYM backend only). See the QSYM paper for details; highly recommended for fuzzing and enabled automatically by the fuzzing helper. -- SYMCC_AFL_COVERAGE_MAP (default empty): When set to the file name of an AFL - coverage map, load the map before executing the target program and use it to - skip solver queries for paths that have already been covered (QSYM backend - only). The map is updated in place, so beware of races when running multiple - instances of SymCC! The fuzzing helper uses this to remember the state of - exploration across multiple executions of the target program. +- SYMCC_AFL_COVERAGE_MAP (default empty): When set to the file name of an + AFL-style coverage map, load the map before executing the target program and + use it to skip solver queries for paths that have already been covered (QSYM + backend only). The map is updated in place, so beware of races when running + multiple instances of SymCC! The fuzzing helper uses this to remember the + state of exploration across multiple executions of the target program. + Warning: This setting has a misleading name - while the format of the map + follows (classic) AFL, the variable isn't meant to point at a map file that + AFL uses too! (Most people should stop reading here.) @@ -96,7 +99,7 @@ There is actually a third category of options: when compiling with SymCC, you can specify the location of its various components via environment variables. This is not necessary in most cases because the build system makes sure that all components know about each other; however, in some advanced setups you may need -to move files around after building them, and in that case you can use the +to move files around after building them, and in that case, you can use the variables documented below to communicate the new locations: - SYMCC_RUNTIME_DIR and SYMCC_RUNTIME32_DIR: The directory that contains the diff --git a/docs/Experiments.txt b/docs/Experiments.txt index a59a5232..6a284ed3 100644 --- a/docs/Experiments.txt +++ b/docs/Experiments.txt @@ -8,6 +8,8 @@ Here we document how to reproduce the experiments that we show in the paper are available on our website [1], which also provides our raw results. Feel free to reach out to us if you encounter problems with reproducing the benchmarks. +The datasets are also archived on figshare [10]. + In the paper, we describe two sets of experiments: we first benchmark SymCC on the CGC programs, then we run it on real-world software. @@ -37,7 +39,7 @@ regular 32-bit binaries built by cb-multios. The analysis of real-world software always follows the same procedure. Assuming you have exported CC=symcc, CXX=sym++ and SYMCC_NO_SYMBOLIC_INPUT=1, first -download the code, then build it using its own build system, finally unset +download the code, then build it using its own build system, and finally unset SYMCC_NO_SYMBOLIC_INPUT and analyze the program in concert with AFL (which requires building a second time for AFL, see docs/Fuzzing.txt). We used AFL 2.56b and built the targets with AFL_USE_ASan=1. Note that the fuzzing helper is @@ -50,7 +52,7 @@ OpenJPEG [4]: we used revision 1f1e9682, built with CMake as described in the files file1.jp2 and file8.jp2 [5]. libarchive [6]: we used revision 9ebb2484, built with CMake as described in the - poject's INSTALL (but adding "-DCMAKE_BUILD_TYPE=Release"), and analyzed + project's INSTALL (but adding "-DCMAKE_BUILD_TYPE=Release"), and analyzed "bin/bsdtar tf @@"; the corpus consisted of just a single dummy file containing the character "A". @@ -78,3 +80,4 @@ running it with AFL according to the QSYM authors' instructions [9]. [7] https://github.com/the-tcpdump-group/tcpdump.git [8] https://github.com/the-tcpdump-group/libpcap.git [9] https://github.com/sslab-gatech/qsym#run-hybrid-fuzzing-with-afl +[10] https://doi.org/10.6084/m9.figshare.24270709.v1 or https://figshare.com/articles/dataset/SymCC_evaluation_data/24270709 diff --git a/docs/Ideas.txt b/docs/Ideas.txt index a2720617..219236ae 100644 --- a/docs/Ideas.txt +++ b/docs/Ideas.txt @@ -35,7 +35,7 @@ because intermediate values are rarely computed without being used: typically, they end up being inputs to future computations, so we couldn't free the corresponding expressions anyway. A notable exception is the computation of values only for output - the expressions for such values could be freed after -the value is output, which would reduce memory consumption especially with +the value is output, which would reduce memory consumption, especially with output-heavy target programs. @@ -63,6 +63,6 @@ improvements they have made over AFL (e.g., AFL++ or Honggfuzz). Forking version Instead of working with a fuzzer, we could also implement forking and some -scheduling strategy ourselves. Georgia Tech has developed some OS-level +scheduling strategies ourselves. Georgia Tech has developed some OS-level primitives that could help to implement such a feature: https://github.com/sslab-gatech/perf-fuzz. diff --git a/docs/Libc.txt b/docs/Libc.txt index 7057ec0e..08ad920e 100644 --- a/docs/Libc.txt +++ b/docs/Libc.txt @@ -11,7 +11,7 @@ as the sanitizers and intercept calls to libc functions, wrapping them with symbolic handling. For example, the wrapper for "memset" obtains the symbolic expression for the value to be written in memory and pushes it to the shadow region of the destination memory. In the future, we may experiment with -compiling (parts of) libc to avoid the effort of manually defining wrappers. +compiling (parts of) the libc to avoid the effort of manually defining wrappers. Initially, we tried the interception mechanism that the LLVM sanitizers use, implemented in the compiler-rt library. The Linux version basically just defines @@ -25,11 +25,11 @@ replaces a given libc function, in the executable and in all libraries that it loads. However, our run-time support library is loaded into the same process and makes heavy use of libc, so we need the ability to use wrappers in one part of the program and concrete functions in another. This turned out to complicate the -compiler-rt based implementation so much that we eventually abandoned the +compiler-rt-based implementation so much that we eventually abandoned the approach. Function renaming provided a convenient alternative: we control all code that is -supposed to call wrappers rather the libc functions proper, so we just rename +supposed to call wrappers rather than the libc functions properly, so we just rename the targets of their calls. For example, a call to "memset" in the program under test is turned into a call to "memset_symbolized", which we can easily define as a regular function wrapping "memset". Calls from our run-time library, on the diff --git a/docs/Optimization.txt b/docs/Optimization.txt index 6b4f2c83..97efdbc2 100644 --- a/docs/Optimization.txt +++ b/docs/Optimization.txt @@ -7,7 +7,7 @@ A popular technique for experimenting with compiler passes is to produce bitcode with "clang -emit-llvm" and run the pass on the resulting bitcode with opt. Note that this approach does not mix well with optimization: simply running "opt -O3" on the instrumented bitcode yields inferior results. Why? In principle, the -instrumentation that adds symbolic-execution capabilities does not interfere +instrumentation that adds symbolic execution capabilities does not interfere with the compiler's regular optimization. However, while "opt -O3" runs the same middle-end optimizations as clang does internally, "clang -O3" performs additional analysis before invoking the middle end. In particular, type-based diff --git a/runtime/qsym_backend/qsym b/runtime/qsym_backend/qsym index 300300e8..fd5e08ea 160000 --- a/runtime/qsym_backend/qsym +++ b/runtime/qsym_backend/qsym @@ -1 +1 @@ -Subproject commit 300300e8f417e057f918eb8fd696290aec3e1a69 +Subproject commit fd5e08eaea764af9a2e2c8bfefead8b23c2a3c5f