From c0f90b9e4a251615288ed4b65ead81cea617da8c Mon Sep 17 00:00:00 2001 From: 2over12 Date: Wed, 17 Aug 2022 16:42:17 -0400 Subject: [PATCH] Add Context Structure to Affect State Dependent Liftings (#617) * add empty contexts * add include * make function const * add helper for uniform mappings * expose cache clearing for operand lifter * decoding context documentation: * move virtual inheritance down * remove unused var names * add type alias * remove underscores * make sure we have poetry * check version in CI * try specify python3 * newer poetry install script * fail fast * try use pythons pip * upgrade pip? * install directly * update in linux too --- .github/workflows/ci.yml | 17 ++++++--- include/remill/Arch/Arch.h | 23 +++++++++--- include/remill/Arch/ArchBase.h | 36 +++++++++++++------ include/remill/Arch/Context.h | 52 +++++++++++++++++++++++++++ include/remill/Arch/Instruction.h | 2 +- include/remill/BC/InstructionLifter.h | 4 ++- lib/Arch/AArch32/Arch.cpp | 3 +- lib/Arch/AArch32/Arch.h | 3 +- lib/Arch/AArch64/Arch.cpp | 5 +-- lib/Arch/Arch.cpp | 30 ++++++++++++---- lib/Arch/CMakeLists.txt | 2 ++ lib/Arch/Context.cpp | 36 +++++++++++++++++++ lib/Arch/Instruction.cpp | 2 +- lib/Arch/SPARC32/Arch.cpp | 5 +-- lib/Arch/SPARC64/Arch.cpp | 5 +-- lib/Arch/Sleigh/Arch.cpp | 26 ++++++++------ lib/Arch/Sleigh/Arch.h | 13 ++++--- lib/Arch/X86/Arch.cpp | 5 +-- lib/BC/TraceLifter.cpp | 9 +++-- test_runner_lib/TestRunner.cpp | 4 ++- 20 files changed, 221 insertions(+), 61 deletions(-) create mode 100644 include/remill/Arch/Context.h create mode 100644 lib/Arch/Context.cpp diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 33fffd41e..9975475b2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,6 +37,10 @@ jobs: with: fetch-depth: 0 - uses: ./.github/actions/prepare_git_user + - name: Get Poetry + shell: bash + run: | + python3 -m pip install poetry - name: Build with build script shell: bash run: | @@ -49,7 +53,6 @@ jobs: 
export VCPKG_ROOT=$(pwd)/../lifting-bits-downloads/vcpkg_${{ matrix.image.name }}-${{ matrix.image.tag }}_llvm-${{ matrix.llvm }}_amd64 export INSTALL_DIR=$(pwd)/remill-preset-install ./scripts/build-preset.sh release - - name: Install Python Test Deps shell: bash run: | @@ -109,6 +112,14 @@ jobs: with: fetch-depth: 0 - uses: ./.github/actions/prepare_git_user + - name: Get Poetry + shell: bash + run: | + python3 -m pip install poetry + - name: Install Python Test Deps + shell: bash + run: | + python3 -m pip install --user ./scripts/diff_tester_export_insns - name: Build with build script shell: bash run: | @@ -121,10 +132,6 @@ jobs: export VCPKG_ROOT=$(pwd)/../lifting-bits-downloads/vcpkg_${{ matrix.os}}_llvm-${{ matrix.llvm }}_xcode-13.0_amd64 export INSTALL_DIR=$(pwd)/remill-preset-install ./scripts/build-preset.sh release - - name: Install Python Test Deps - shell: bash - run: | - pip3 install --user ./scripts/diff_tester_export_insns - name: Run tests shell: bash working-directory: remill-build diff --git a/include/remill/Arch/Arch.h b/include/remill/Arch/Arch.h index 3e7c94874..d7e865c6b 100644 --- a/include/remill/Arch/Arch.h +++ b/include/remill/Arch/Arch.h @@ -30,6 +30,7 @@ #include #include #include +#include #pragma clang diagnostic pop @@ -170,6 +171,9 @@ class Arch { virtual ~Arch(void); + + virtual DecodingContext CreateInitialContext(void) const = 0; + // Factory method for loading the correct architecture class for a given // operating system and architecture class. static auto Get(llvm::LLVMContext &context, std::string_view os, @@ -281,14 +285,23 @@ class Arch { // walk up, one byte at a time, to `MaxInstructionSize(false)` // bytes being passed to the decoder, until you successfully decode // or ultimately fail. 
- virtual bool DecodeInstruction(uint64_t address, std::string_view instr_bytes, - Instruction &inst) const = 0; + + // The decoder takes contextual information in the form of a DecodingContext, making a copy to produce a ContextMap which is a function that maps + // a successor to a new context that updates the old context. + + using DecodingResult = std::optional; + + virtual DecodingResult + DecodeInstruction(uint64_t address, std::string_view instr_bytes, + Instruction &inst, DecodingContext context) const = 0; // Decode an instruction that is within a delay slot. - bool DecodeDelayedInstruction(uint64_t address, std::string_view instr_bytes, - Instruction &inst) const { + DecodingResult + DecodeDelayedInstruction(uint64_t address, std::string_view instr_bytes, + Instruction &inst, DecodingContext context) const { inst.in_delay_slot = true; - return this->DecodeInstruction(address, instr_bytes, inst); + return this->DecodeInstruction(address, instr_bytes, inst, + std::move(context)); } // Minimum alignment of an instruction for this particular architecture. diff --git a/include/remill/Arch/ArchBase.h b/include/remill/Arch/ArchBase.h index ec3438491..2772cc98f 100644 --- a/include/remill/Arch/ArchBase.h +++ b/include/remill/Arch/ArchBase.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -31,13 +32,9 @@ namespace remill { struct Register; + // Internal base architecture for all Remill-internal architectures. 
class ArchBase : public remill::Arch { - protected: - virtual bool ArchDecodeInstruction(uint64_t address, - std::string_view instr_bytes, - Instruction &inst) const = 0; - public: using ArchPtr = std::unique_ptr; @@ -73,12 +70,6 @@ class ArchBase : public remill::Arch { unsigned RegMdID(void) const final; - virtual bool DecodeInstruction(uint64_t address, std::string_view instr_bytes, - Instruction &inst) const override; - - OperandLifter::OpLifterPtr - DefaultLifter(const remill::IntrinsicTable &intrinsics) const override; - // Get the state pointer and various other types from the `llvm::LLVMContext` // associated with `module`. // @@ -114,4 +105,27 @@ class ArchBase : public remill::Arch { mutable std::unique_ptr instrinsics{nullptr}; }; +class DefaultContextAndLifter : virtual public remill::ArchBase { + public: + virtual DecodingContext CreateInitialContext(void) const override; + + virtual std::optional + DecodeInstruction(uint64_t address, std::string_view instr_bytes, + Instruction &inst, DecodingContext context) const override; + + + OperandLifter::OpLifterPtr + DefaultLifter(const remill::IntrinsicTable &intrinsics) const override; + + + DefaultContextAndLifter(llvm::LLVMContext *context_, OSName os_name_, + ArchName arch_name_); + + protected: + virtual bool ArchDecodeInstruction(uint64_t address, + std::string_view instr_bytes, + Instruction &inst) const = 0; +}; + + } // namespace remill diff --git a/include/remill/Arch/Context.h b/include/remill/Arch/Context.h new file mode 100644 index 000000000..8aadd577d --- /dev/null +++ b/include/remill/Arch/Context.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2022 Trail of Bits, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + + +#include <functional> +#include <string> +#include <unordered_map> + +namespace remill { + +/// A decoding context is contextual information about the state of the program that affects decoding, e.g. the thumb mode register on ARM +/// We allow clients to interpose on a context for resolution + +/// We return a function of successor -> DecodingContext. The decoder defines a relation on the +/// previous context and the successor address that produces a new decoding. +/// This definition of returned contexts allows us to cleanly handle situations like indirect jumps in arm +class DecodingContext { + + private: + std::unordered_map<std::string, uint64_t> context_value; + + public: + using ContextMap = std::function<DecodingContext(uint64_t)>; + + DecodingContext() = default; + + DecodingContext(std::unordered_map<std::string, uint64_t> context_value); + + + uint64_t GetContextValue(const std::string &context_reg) const; + DecodingContext PutContextReg(std::string creg, uint64_t value) const; + + static ContextMap UniformContextMapping(DecodingContext cst); +}; + +} // namespace remill \ No newline at end of file diff --git a/include/remill/Arch/Instruction.h b/include/remill/Arch/Instruction.h index 2e04cddc0..192a44d81 100644 --- a/include/remill/Arch/Instruction.h +++ b/include/remill/Arch/Instruction.h @@ -352,7 +352,7 @@ class Instruction { Operand &EmplaceOperand(const Operand::Address &op); - const InstructionLifter::LifterPtr &GetLifter(); + const InstructionLifter::LifterPtr &GetLifter() const; void SetLifter(InstructionLifter::LifterPtr lifter); diff --git a/include/remill/BC/InstructionLifter.h 
b/include/remill/BC/InstructionLifter.h index 616cd40db..c34b4e975 100644 --- a/include/remill/BC/InstructionLifter.h +++ b/include/remill/BC/InstructionLifter.h @@ -67,6 +67,8 @@ class OperandLifter { std::string_view reg_name) const = 0; virtual llvm::Type *GetMemoryType() = 0; + + virtual void ClearCache(void) const = 0; }; // Wraps the process of lifting an instruction into a block. This resolves @@ -108,7 +110,7 @@ class InstructionLifter : public OperandLifter { std::string_view reg_name) const override final; // Clear out the cache of the current register values/addresses loaded. - void ClearCache(void) const; + void ClearCache(void) const override; virtual llvm::Type *GetMemoryType() override final; diff --git a/lib/Arch/AArch32/Arch.cpp b/lib/Arch/AArch32/Arch.cpp index f5dd91af5..1f42ad645 100644 --- a/lib/Arch/AArch32/Arch.cpp +++ b/lib/Arch/AArch32/Arch.cpp @@ -50,7 +50,8 @@ namespace remill { AArch32Arch::AArch32Arch(llvm::LLVMContext *context_, OSName os_name_, ArchName arch_name_) : ArchBase(context_, os_name_, arch_name_), - AArch32ArchBase(context_, os_name_, arch_name_) {} + AArch32ArchBase(context_, os_name_, arch_name_), + DefaultContextAndLifter(context_, os_name_, arch_name_) {} AArch32Arch::~AArch32Arch(void) {} diff --git a/lib/Arch/AArch32/Arch.h b/lib/Arch/AArch32/Arch.h index 9e364ad05..b6e7cdb48 100644 --- a/lib/Arch/AArch32/Arch.h +++ b/lib/Arch/AArch32/Arch.h @@ -19,7 +19,8 @@ #include namespace remill { -class AArch32Arch final : public AArch32ArchBase { +class AArch32Arch final : public AArch32ArchBase, + public DefaultContextAndLifter { public: AArch32Arch(llvm::LLVMContext *context_, OSName os_name_, ArchName arch_name_); diff --git a/lib/Arch/AArch64/Arch.cpp b/lib/Arch/AArch64/Arch.cpp index 265c532a5..34e0760ac 100644 --- a/lib/Arch/AArch64/Arch.cpp +++ b/lib/Arch/AArch64/Arch.cpp @@ -106,7 +106,7 @@ Instruction::Category InstCategory(const aarch64::InstData &inst) { } } -class AArch64Arch final : public ArchBase { +class 
AArch64Arch final : public DefaultContextAndLifter { public: AArch64Arch(llvm::LLVMContext *context_, OSName os_name_, ArchName arch_name_); @@ -148,7 +148,8 @@ class AArch64Arch final : public ArchBase { AArch64Arch::AArch64Arch(llvm::LLVMContext *context_, OSName os_name_, ArchName arch_name_) - : ArchBase(context_, os_name_, arch_name_) {} + : ArchBase(context_, os_name_, arch_name_), + DefaultContextAndLifter(context_, os_name_, arch_name_) {} AArch64Arch::~AArch64Arch(void) {} diff --git a/lib/Arch/Arch.cpp b/lib/Arch/Arch.cpp index 8df06c8b0..d49889ba1 100644 --- a/lib/Arch/Arch.cpp +++ b/lib/Arch/Arch.cpp @@ -850,16 +850,34 @@ const IntrinsicTable *ArchBase::GetInstrinsicTable(void) const { return this->instrinsics.get(); } -OperandLifter::OpLifterPtr -ArchBase::DefaultLifter(const remill::IntrinsicTable &intrinsics) const { - return std::make_shared(this, intrinsics); + +DecodingContext DefaultContextAndLifter::CreateInitialContext(void) const { + return DecodingContext(); } -bool ArchBase::DecodeInstruction(uint64_t address, std::string_view instr_bytes, - Instruction &inst) const { +Arch::DecodingResult DefaultContextAndLifter::DecodeInstruction( + uint64_t address, std::string_view instr_bytes, Instruction &inst, + DecodingContext context) const { inst.SetLifter(std::make_unique( this, this->GetInstrinsicTable())); - return this->ArchDecodeInstruction(address, instr_bytes, inst); + if (this->ArchDecodeInstruction(address, instr_bytes, inst)) { + return [](uint64_t) -> DecodingContext { return DecodingContext(); }; + } + + return std::nullopt; } + +OperandLifter::OpLifterPtr DefaultContextAndLifter::DefaultLifter( + const remill::IntrinsicTable &intrinsics) const { + return std::make_shared(this, intrinsics); +} + + +DefaultContextAndLifter::DefaultContextAndLifter(llvm::LLVMContext *context_, + OSName os_name_, + ArchName arch_name_) + : ArchBase(context_, os_name_, arch_name_) {} + + } // namespace remill diff --git a/lib/Arch/CMakeLists.txt 
b/lib/Arch/CMakeLists.txt index e10ce79cf..89cd741f8 100644 --- a/lib/Arch/CMakeLists.txt +++ b/lib/Arch/CMakeLists.txt @@ -17,10 +17,12 @@ add_library(remill_arch STATIC "${REMILL_INCLUDE_DIR}/remill/Arch/Instruction.h" "${REMILL_INCLUDE_DIR}/remill/Arch/Name.h" "${REMILL_INCLUDE_DIR}/remill/Arch/ArchBase.h" + "${REMILL_INCLUDE_DIR}/remill/Arch/Context.h" Arch.cpp BitManipulation.h Instruction.cpp + Context.cpp Name.cpp ) diff --git a/lib/Arch/Context.cpp b/lib/Arch/Context.cpp new file mode 100644 index 000000000..64fd7999e --- /dev/null +++ b/lib/Arch/Context.cpp @@ -0,0 +1,36 @@ + +#include <glog/logging.h> +#include <remill/Arch/Context.h> + +namespace remill { + +DecodingContext::DecodingContext( + std::unordered_map<std::string, uint64_t> context_value) + : context_value(std::move(context_value)) {} + + +uint64_t +DecodingContext::GetContextValue(const std::string &context_reg) const { + + if (auto res = this->context_value.find(context_reg); + res != this->context_value.end()) { + return res->second; + } + + LOG(FATAL) << "No context value for " << context_reg + << " but it is required for decoding"; +} +DecodingContext DecodingContext::PutContextReg(std::string creg, + uint64_t value) const { + std::unordered_map<std::string, uint64_t> new_value(this->context_value); + new_value.insert_or_assign(std::move(creg), value); // replace any existing binding; emplace would silently keep the old value + return DecodingContext(std::move(new_value)); +} + +DecodingContext::ContextMap +DecodingContext::UniformContextMapping(DecodingContext cst) { + return [cst = std::move(cst)](uint64_t) -> DecodingContext { return cst; }; +} + + +} // namespace remill \ No newline at end of file diff --git a/lib/Arch/Instruction.cpp b/lib/Arch/Instruction.cpp index e4ef70604..7a51ec23f 100644 --- a/lib/Arch/Instruction.cpp +++ b/lib/Arch/Instruction.cpp @@ -793,7 +793,7 @@ std::string Instruction::Serialize(void) const { return ss.str(); } -const InstructionLifter::LifterPtr &Instruction::GetLifter() { +const InstructionLifter::LifterPtr &Instruction::GetLifter() const { return this->lifter; } diff --git a/lib/Arch/SPARC32/Arch.cpp b/lib/Arch/SPARC32/Arch.cpp 
index 87b212e32..f318b3aae 100644 --- a/lib/Arch/SPARC32/Arch.cpp +++ b/lib/Arch/SPARC32/Arch.cpp @@ -130,10 +130,11 @@ void AddImmop(Instruction &inst, uint64_t imm, unsigned size, bool is_signed) { } -class SPARC32Arch final : public ArchBase { +class SPARC32Arch final : public DefaultContextAndLifter { public: SPARC32Arch(llvm::LLVMContext *context_, OSName os_name_, ArchName arch_name_) - : ArchBase(context_, os_name_, arch_name_) {} + : ArchBase(context_, os_name_, arch_name_), + DefaultContextAndLifter(context_, os_name_, arch_name_) {} virtual ~SPARC32Arch(void) = default; diff --git a/lib/Arch/SPARC64/Arch.cpp b/lib/Arch/SPARC64/Arch.cpp index 267439011..88a8ac82f 100644 --- a/lib/Arch/SPARC64/Arch.cpp +++ b/lib/Arch/SPARC64/Arch.cpp @@ -38,10 +38,11 @@ static const std::string_view kSPRegName = "sp"; static const std::string_view kPCRegName = "pc"; } // namespace -class SPARC64Arch final : public ArchBase { +class SPARC64Arch final : public DefaultContextAndLifter { public: SPARC64Arch(llvm::LLVMContext *context_, OSName os_name_, ArchName arch_name_) - : ArchBase(context_, os_name_, arch_name_) {} + : ArchBase(context_, os_name_, arch_name_), + DefaultContextAndLifter(context_, os_name_, arch_name_) {} virtual ~SPARC64Arch(void) = default; diff --git a/lib/Arch/Sleigh/Arch.cpp b/lib/Arch/Sleigh/Arch.cpp index 4d6de3b9a..10635f926 100644 --- a/lib/Arch/Sleigh/Arch.cpp +++ b/lib/Arch/Sleigh/Arch.cpp @@ -366,24 +366,30 @@ std::string CustomLoadImage::getArchType(void) const { void CustomLoadImage::adjustVma(long) {} -bool SleighArch::DecodeInstruction(uint64_t address, - std::string_view instr_bytes, - Instruction &inst) const { +SleighArch::DecodingResult +SleighArch::DecodeInstruction(uint64_t address, std::string_view instr_bytes, + Instruction &inst, + DecodingContext context) const { inst.SetLifter( std::make_shared(this, *this->GetInstrinsicTable())); assert(inst.GetLifter() != nullptr); - return this->ArchDecodeInstruction(address, instr_bytes, inst); 
+ if (const_cast(this)->DecodeInstructionImpl( + address, instr_bytes, inst)) { + return [this](uint64_t) -> DecodingContext { + return this->CreateInitialContext(); + }; + } + + return std::nullopt; } -bool SleighArch::ArchDecodeInstruction(uint64_t address, - std::string_view instr_bytes, - Instruction &inst) const { - // TODO(Ian): Since we dont control sleigh we probably need DecodeInsn to be non const? - return const_cast(this)->DecodeInstructionImpl( - address, instr_bytes, inst); + +DecodingContext SleighArch::CreateInitialContext(void) const { + return DecodingContext(); } + SleighArch::SleighArch(llvm::LLVMContext *context_, OSName os_name_, ArchName arch_name_, std::string sla_name, std::string pspec_name) diff --git a/lib/Arch/Sleigh/Arch.h b/lib/Arch/Sleigh/Arch.h index 4f01e510f..cde78c666 100644 --- a/lib/Arch/Sleigh/Arch.h +++ b/lib/Arch/Sleigh/Arch.h @@ -196,16 +196,15 @@ class SleighArch : virtual public ArchBase { public: - virtual bool DecodeInstruction(uint64_t address, std::string_view instr_bytes, - Instruction &inst) const override; + OperandLifter::OpLifterPtr + DefaultLifter(const remill::IntrinsicTable &intrinsics) const override; - virtual bool ArchDecodeInstruction(uint64_t address, - std::string_view instr_bytes, - Instruction &inst) const override; + virtual DecodingContext CreateInitialContext(void) const override; - OperandLifter::OpLifterPtr - DefaultLifter(const remill::IntrinsicTable &intrinsics) const override; + virtual std::optional + DecodeInstruction(uint64_t address, std::string_view instr_bytes, + Instruction &inst, DecodingContext context) const override; // Arch specific preperation diff --git a/lib/Arch/X86/Arch.cpp b/lib/Arch/X86/Arch.cpp index d65b1b3c4..9b677fea3 100644 --- a/lib/Arch/X86/Arch.cpp +++ b/lib/Arch/X86/Arch.cpp @@ -780,7 +780,7 @@ static void DecodeOperand(Instruction &inst, const xed_decoded_inst_t *xedd, } } -class X86Arch final : public X86ArchBase { +class X86Arch final : public X86ArchBase, public 
DefaultContextAndLifter { public: X86Arch(llvm::LLVMContext *context_, OSName os_name_, ArchName arch_name_); @@ -798,7 +798,8 @@ class X86Arch final : public X86ArchBase { X86Arch::X86Arch(llvm::LLVMContext *context_, OSName os_name_, ArchName arch_name_) : ArchBase(context_, os_name_, arch_name_), - X86ArchBase(context_, os_name_, arch_name_) { + X86ArchBase(context_, os_name_, arch_name_), + DefaultContextAndLifter(context_, os_name_, arch_name_) { static bool xed_is_initialized = false; if (!xed_is_initialized) { diff --git a/lib/BC/TraceLifter.cpp b/lib/BC/TraceLifter.cpp index cfe7dbfee..34ca16bc1 100644 --- a/lib/BC/TraceLifter.cpp +++ b/lib/BC/TraceLifter.cpp @@ -345,7 +345,9 @@ bool TraceLifter::Impl::Lift( inst.Reset(); - (void) arch->DecodeInstruction(inst_addr, inst_bytes, inst); + // TODO(Ian): not passing context around in trace lifter + std::ignore = arch->DecodeInstruction(inst_addr, inst_bytes, inst, + this->arch->CreateInitialContext()); auto lift_status = inst.GetLifter()->LiftIntoBlock(inst, block, state_ptr); @@ -359,8 +361,9 @@ bool TraceLifter::Impl::Lift( if (try_delay) { delayed_inst.Reset(); if (!ReadInstructionBytes(inst.delayed_pc) || - !arch->DecodeDelayedInstruction(inst.delayed_pc, inst_bytes, - delayed_inst)) { + !arch->DecodeDelayedInstruction( + inst.delayed_pc, inst_bytes, delayed_inst, + this->arch->CreateInitialContext())) { LOG(ERROR) << "Couldn't read delayed inst " << delayed_inst.Serialize(); AddTerminatingTailCall(block, intrinsics->error, *intrinsics); diff --git a/test_runner_lib/TestRunner.cpp b/test_runner_lib/TestRunner.cpp index ee7fd48e1..ab3b63e61 100644 --- a/test_runner_lib/TestRunner.cpp +++ b/test_runner_lib/TestRunner.cpp @@ -240,7 +240,9 @@ LiftingTester::LiftInstructionFunction(std::string_view fname, std::string_view bytes, uint64_t address) { remill::Instruction insn; - if (!this->arch->DecodeInstruction(address, bytes, insn)) { + // This works for now since each arch has an initial context that represents 
the arch correctly. + if (!this->arch->DecodeInstruction(address, bytes, insn, + this->arch->CreateInitialContext())) { LOG(ERROR) << "Failed decode"; return std::nullopt; }