Moved vset stalling to fetch

riscv-software-src · May 21, 2024 · daaa41f · daaa41f
1 parent c098650
commit daaa41f
Show file tree

Hide file tree

Showing 9 changed files with 155 additions and 97 deletions.
diff --git a/arches/big_core.yaml b/arches/big_core.yaml
@@ -7,7 +7,7 @@ include: medium_core.yaml
 
 top.cpu.core0:
   fetch.params.num_to_fetch:   8
-  decode.params.num_to_decode: 8
+  decode.params.num_to_decode: 3
   rename.params.num_to_rename: 8
   rename.params.num_integer_renames: 64
   rename.params.num_float_renames: 64

diff --git a/core/CPUTopology.cpp b/core/CPUTopology.cpp
@@ -381,8 +381,10 @@ void olympia::CoreTopologySimple::bindTree(sparta::RootTreeNode* root_node)
                 pipe_target_end = stoi(iq[1]);
             }
             pipe_target_end++;
-            const std::string vset_in =
+            const std::string vset_in_decode =
                 core_node + ".decode." + "ports.in_vset_inst";
+            const std::string vset_in_fetch =
+                core_node + ".fetch." + "ports.in_vset_inst";
             for (int pipe_idx = pipe_target_start; pipe_idx < pipe_target_end; ++pipe_idx)
             {
                 std::string unit_name = "exe" + std::to_string(pipe_idx);
@@ -403,7 +405,8 @@ void olympia::CoreTopologySimple::bindTree(sparta::RootTreeNode* root_node)
                         // only bind execute pipe -> decode port for an issue queue if it has a vset pipe
                         const std::string exe_vset_out =
                             core_node + ".execute." + unit_name + ".ports.out_vset";
-                        bind_ports(vset_in, exe_vset_out);
+                        bind_ports(vset_in_decode, exe_vset_out);
+                        bind_ports(vset_in_fetch, exe_vset_out);
                         break; // break after because there should only be one vset per issue queue
                     }
                 }

diff --git a/core/CoreTypes.hpp b/core/CoreTypes.hpp
@@ -18,9 +18,6 @@ namespace olympia::core_types
         N_REGFILES = RF_INVALID
     };
 
-    // std::vector<core_types::RegFile> reg_files = {core_types::RF_INTEGER, core_types::RF_FLOAT,
-    // core_types::RF_VECTOR};
-
     static inline const char* const regfile_names[] = {"integer", "float", "vector"};
 
     static inline const char* const issue_queue_types[] = {"alu", "fpu", "br", "vint", "vset"};

diff --git a/core/Decode.cpp b/core/Decode.cpp
@@ -19,6 +19,7 @@ namespace olympia
         sparta::Unit(node),
 
         fetch_queue_("FetchQueue", p->fetch_queue_size, node->getClock(), &unit_stat_set_),
+        uop_queue_("UOpQueue", p->uop_queue_size, node->getClock(), &unit_stat_set_),
         fusion_num_fuse_instructions_(&unit_stat_set_, "fusion_num_fuse_instructions",
                                       "The number of custom instructions created by fusion",
                                       sparta::Counter::COUNT_NORMAL),
@@ -121,7 +122,6 @@ namespace olympia
         VCSRs_.lmul = inst->getLMUL();
         VCSRs_.sew = inst->getSEW();
         VCSRs_.vl = inst->getVL();
-        waiting_on_vset_ = false;
         // schedule decode, because we've been stalled on vset
         ev_decode_insts_event_.schedule(sparta::Clock::Cycle(0));
     }
@@ -137,7 +137,7 @@ namespace olympia
     // Decode instructions
     void Decode::decodeInsts_()
     {
-        uint32_t num_decode = std::min(uop_queue_credits_, fetch_queue_.size());
+        uint32_t num_decode = std::min(uop_queue_credits_, fetch_queue_.size() + uop_queue_.size());
         num_decode = std::min(num_decode, num_to_decode_);
 
         // buffer to maximize the chances of a group match limited
@@ -162,98 +162,95 @@ namespace olympia
             // Send instructions on their way to rename
             for (uint32_t i = 0; i < num_decode; ++i)
             {
-                const auto & inst = fetch_queue_.read(0);
-                // if we're waiting on a vset, but it's a scalar instruction
-                // we can process all scalars after the vset until we reach a vset the decode queue
-                if ((!waiting_on_vset_) || (waiting_on_vset_ && !inst->isVector()))
-                {
-                    // we only need to stall for vset when it's
-                    // vsetvl or a vset{i}vl{i} that has a vl that is not the default
-                    if(inst->isVset()){
-
-                        if(inst->getSourceOpInfoList()[0].field_value != 0){
-                            // vl is being set by register, need to block
-                            waiting_on_vset_ = true;
-                        }
-                        else if(inst->getSourceOpInfoList()[0].field_value == 0 && inst->getDestOpInfoList()[0].field_value != 0){
-                            // set vl to vlmax, no need to block
-                            VCSRs_.vl = Inst::VLMAX;
-                        }
-                    }
-                    if (inst->getMnemonic() == "vsetvl")
-                    {
-                        // vsetvl depends on register values for VTYPE, need to wait till execution
-                        waiting_on_vset_ = true;
+                if(uop_queue_.size() > 0){
+                    const auto & inst = uop_queue_.read(0);
+                    insts->emplace_back(inst);
+                    inst->setStatus(Inst::Status::DECODED);
+                    ILOG("From UOp Queue Decoded: " << inst);
+                    uop_queue_.pop();
+                }
+                else{
+                    const auto & inst = fetch_queue_.read(0);
+                    // if we're waiting on a vset, but it's a scalar instruction
+                    // we can process all scalars after the vset until we reach a vset the decode queue
+                    if(inst->isVset() && inst->getSourceOpInfoList()[0].field_value == 0 && inst->getDestOpInfoList()[0].field_value != 0){
+                        // set vl to vlmax, no need to block
+                        VCSRs_.vl = Inst::VLMAX;
                     }
-                    else if (inst->isVector())
+                    if (!inst->isVset() && inst->isVector())
                     {
                         // set LMUL, VSET, VL
                         inst->setVCSRs(VCSRs_);
                     }
-                    if (VCSRs_.lmul > 1)
+                    if (inst->getLMUL() > 1 && !inst->isVset())
                     {
                         // lmul > 1, fracture instruction into UOps
                         inst->setUOp(true); // mark instruction to denote it has UOPs
-                        if (uop_queue_credits_ - i > VCSRs_.lmul)
+                        // turn this into a state machine
+                        // send them out based on credit
+                        // state indicating if we're decoding as normal or draining a UOp Queue
+                        ILOG("Inst: " << inst << " is being split into " << VCSRs_.lmul << " UOPs");
+                        // we can process the lmul, we subtract from uop_queue_credits_
+                        // because num_decode is min of both fetch queue and uop_queue_credits_
+                        // which doesn't factor in the uop amount per instruction
+                        insts->emplace_back(inst);
+                        inst->setStatus(Inst::Status::DECODED);
+                        fetch_queue_.pop();
+                        for (uint32_t j = 1; j < VCSRs_.lmul; ++j)
                         {
-                            ILOG("Inst: " << inst << " is being split into " << VCSRs_.lmul << " UOPs");
-                            // we can process the lmul, we subtract from uop_queue_credits_
-                            // because num_decode is min of both fetch queue and uop_queue_credits_
-                            // which doesn't factor in the uop amount per instruction
-                            for (uint32_t j = 1; j < VCSRs_.lmul; ++j)
+                            i++;
+                            // we create lmul - 1 instructions, because the original instruction
+                            // will also be executed, so we start creating UOPs at vector
+                            // registers + 1 until LMUL
+                            MavisType* mavis_facade_ = getMavis(getContainer());
+                            const std::string mnemonic = inst->getMnemonic();
+                            auto srcs = inst->getSourceOpInfoList();
+                            // determine different modes of agnostic vs undisturbed
+                            // parameter in simulator for setting ^
+                            for (auto & src : srcs)
+                            {
+                                src.field_value += j;
+                            }
+                            auto dests = inst->getDestOpInfoList();
+                            for (auto & dest : dests)
                             {
-                                // we create lmul - 1 instructions, because the original instruction
-                                // will also be executed, so we start creating UOPs at vector
-                                // registers + 1 until LMUL
-                                MavisType* mavis_facade_ = getMavis(getContainer());
-                                const std::string mnemonic = inst->getMnemonic();
-                                auto srcs = inst->getSourceOpInfoList();
-                                for (auto & src : srcs)
-                                {
-                                    src.field_value += j;
-                                }
-                                auto dests = inst->getDestOpInfoList();
-                                for (auto & dest : dests)
-                                {
-                                    dest.field_value += j;
-                                }
-                                const auto imm = inst->getImmediate();
-                                mavis::ExtractorDirectOpInfoList ex_info(mnemonic, srcs, dests,
-                                                                         imm);
-                                InstPtr new_inst =
-                                    mavis_facade_->makeInstDirectly(ex_info, getClock());
-                                InstPtr inst_uop_ptr(new Inst(*new_inst));
-                                inst_uop_ptr->setVCSRs(VCSRs_);
-                                inst_uop_ptr->setUOpID(j);
-                                inst->appendUOp(inst_uop_ptr);
+                                dest.field_value += j;
+                            }
+                            const auto imm = inst->getImmediate();
+                            mavis::ExtractorDirectOpInfoList ex_info(mnemonic, srcs, dests,
+                                                                        imm);
+                            InstPtr new_inst =
+                                mavis_facade_->makeInstDirectly(ex_info, getClock());
+                            // setting UOp instructions to have the same UID and PID as parent instruction
+                            new_inst->setUniqueID(inst->getUniqueID());
+                            new_inst->setProgramID(inst->getProgramID());
+                            InstPtr inst_uop_ptr(new Inst(*new_inst));
+                            inst_uop_ptr->setVCSRs(VCSRs_);
+                            inst_uop_ptr->setUOpID(j);
+                            inst->appendUOp(inst_uop_ptr);
+                            if(i < num_decode){
                                 insts->emplace_back(inst_uop_ptr);
                                 inst_uop_ptr->setStatus(Inst::Status::DECODED);
                             }
-                            i += VCSRs_.lmul; // increment decode number based on lmul
-                        }
-                        else
-                        {
-                            // can't decode this instruction, not enough decode credits
-                            // to support cracking into multiple uops
-                            break;
+                            else{
+                                ILOG("Not enough decode credits to process UOp, appending to uop_queue_ " << inst_uop_ptr);
+                                uop_queue_.push(inst_uop_ptr);
+                            }
                         }
                     }
-                    insts->emplace_back(inst);
-                    inst->setStatus(Inst::Status::DECODED);
+                    else{
+                        insts->emplace_back(inst);
+                        inst->setStatus(Inst::Status::DECODED);
 
-                    if (fusion_enable_)
-                    {
-                        uids.push_back(inst->getMavisUid());
-                    }
+                        if (fusion_enable_)
+                        {
+                            uids.push_back(inst->getMavisUid());
+                        }
 
-                    ILOG("Decoded: " << inst);
+                        ILOG("Decoded: " << inst);
 
-                    fetch_queue_.pop();
-                }
-                else
-                {
-                    ILOG("Stalling due to waiting on vset: " << inst);
-                    break;
+                        fetch_queue_.pop();
+                    }
                 }
             }
             if (fusion_enable_)

diff --git a/core/Decode.hpp b/core/Decode.hpp
@@ -75,6 +75,9 @@ namespace olympia
             //! \brief depth of the input instruction buffer
             PARAMETER(uint32_t, fetch_queue_size, 10, "Size of the fetch queue")
 
+            //! \brief depth of UOp Queue
+            PARAMETER(uint32_t, uop_queue_size, 8, "Size of the UOp queue")
+
             //! \brief enable fusion operations
             //!
             //! master enable, when false fusion_* parmeters have no effect
@@ -144,6 +147,7 @@ namespace olympia
       private:
         // The internal instruction queue
         InstQueue fetch_queue_;
+        InstQueue uop_queue_;
 
         // Port listening to the fetch queue appends - Note the 1 cycle delay
         sparta::DataInPort<InstGroupPtr> fetch_queue_write_in_{&unit_port_set_,
@@ -333,7 +337,6 @@ namespace olympia
         void handleFlush_(const FlushManager::FlushingCriteria & criteria);
 
         uint32_t uop_queue_credits_ = 0;
-        bool waiting_on_vset_ = false;
     };
 
     //! \brief the fusion functor/function objects

diff --git a/core/ExecutePipe.cpp b/core/ExecutePipe.cpp
@@ -43,7 +43,7 @@ namespace olympia
             cpu_node = getContainer()->getRoot();
         }
         for (uint32_t rf = 0; rf < core_types::RegFile::N_REGFILES;
-             ++rf) // for (const auto rf : core_types::reg_files)
+             ++rf)
         {
             // alu0, alu1 name is based on exe names, point to issue_queue name instead
             scoreboard_views_[rf].reset(
@@ -77,7 +77,7 @@ namespace olympia
             {
                 if (num_passes_needed_ == 0)
                 {
-                    uint32_t num_passes = std::ceil((ex_inst->getVL()/ex_inst->getSEW()) / valu_adder_num_);
+                    const uint32_t num_passes = std::ceil((ex_inst->getVL()/ex_inst->getSEW()) / valu_adder_num_);
                     if (num_passes > 1)
                     {
                         // only care about cases with multiple passes
@@ -120,6 +120,7 @@ namespace olympia
         {
             if (num_passes_needed_ != 0)
             {
+                // resetting counters for a vector instruction that needed more than 1 pass
                 curr_num_pass_ = 0;
                 num_passes_needed_ = 0;
             }

diff --git a/core/Fetch.cpp b/core/Fetch.cpp
@@ -22,7 +22,8 @@ namespace olympia
         sparta::Unit(node),
         num_insts_to_fetch_(p->num_to_fetch),
         skip_nonuser_mode_(p->skip_nonuser_mode),
-        my_clk_(getClock())
+        my_clk_(getClock()),
+        fetched_queue_("FetchedQueue", p->fetched_queue_size, node->getClock(), &unit_stat_set_)
     {
         in_fetch_queue_credits_.
             registerConsumerHandler(CREATE_SPARTA_HANDLER_WITH_DATA(Fetch, receiveFetchQueueCredits_, uint32_t));
@@ -32,6 +33,8 @@ namespace olympia
 
         fetch_inst_event_.reset(new sparta::SingleCycleUniqueEvent<>(&unit_event_set_, "fetch_random",
                                                                      CREATE_SPARTA_HANDLER(Fetch, fetchInstruction_)));
+        in_vset_inst_.registerConsumerHandler(
+            CREATE_SPARTA_HANDLER_WITH_DATA(Fetch, process_vset_, InstPtr));
         // Schedule a single event to start reading from a trace file
         sparta::StartupEvent(node, CREATE_SPARTA_HANDLER(Fetch, initialize_));
 
@@ -51,7 +54,7 @@ namespace olympia
 
         fetch_inst_event_->schedule(1);
     }
-
+    
     void Fetch::fetchInstruction_()
     {
         const uint32_t upper = std::min(credits_inst_queue_, num_insts_to_fetch_);
@@ -62,16 +65,48 @@ namespace olympia
         InstGroupPtr insts_to_send = sparta::allocate_sparta_shared_pointer<InstGroup>(instgroup_allocator);
         for(uint32_t i = 0; i < upper; ++i)
         {
-            InstPtr ex_inst = inst_generator_->getNextInst(my_clk_);
-            if(SPARTA_EXPECT_TRUE(nullptr != ex_inst))
-            {
-                ex_inst->setSpeculative(speculative_path_);
-                insts_to_send->emplace_back(ex_inst);
-
-                ILOG("Sending: " << ex_inst << " down the pipe");
-            }
-            else {
-                break;
+            if(!waiting_on_vset_ || fetched_queue_.size() < fetched_queue_.capacity()){
+                // Note -> should we change this to block after the first vector instruction after
+                // a vset is detected
+                InstPtr ex_inst = nullptr;
+                if(fetched_queue_.size() > 0){
+                    // if we have already fetched instructions, we should process those first
+                    ex_inst = fetched_queue_.read(0);
+                    fetched_queue_.pop();
+                }
+                else{
+                    ex_inst = inst_generator_->getNextInst(my_clk_);
+                }
+                if(SPARTA_EXPECT_TRUE(nullptr != ex_inst)){
+                    if ((!waiting_on_vset_) || (waiting_on_vset_ && !ex_inst->isVector()))
+                    {
+                        // we only need to stall for vset when it's
+                        // vsetvl or a vset{i}vl{i} that has a vl that is not the default
+                        // any imms can be decoded here and we don't have to stall vset
+                        // check if indirect vset
+                        // move stalling check to fetch, fetch has to break it up, once one direct vset is detected
+                        if(ex_inst->isVset()){
+                            if(ex_inst->getSourceOpInfoList()[0].field_value != 0 || ex_inst->getOpCodeInfo()->getInstructionUniqueID() == 315){
+                                // vl is being set by register, need to block
+                                // vsetvl in mavis -> give it a mavis id, if mavisid == vsetvl number
+                                waiting_on_vset_ = true;
+                            }
+                        }
+                        ex_inst->setSpeculative(speculative_path_);
+                        insts_to_send->emplace_back(ex_inst);
+
+                        ILOG("Sending: " << ex_inst << " down the pipe");
+                    }
+                    else{
+                        // store fetched instruction in queue
+                        fetched_queue_.push((ex_inst));
+                        break;
+                    }
+                }
+                else{
+                    ILOG("Stalling due to waiting on vset");
+                    break;
+                }
             }
         }
 
@@ -95,6 +130,13 @@ namespace olympia
         }
     }
 
+    void Fetch::process_vset_(const InstPtr & inst) // consumer handler registered on in_vset_inst_; `inst` is not read in this body
+    {
+        waiting_on_vset_ = false; // clear the stall flag that fetchInstruction_ sets when it sends a blocking vset
+        ILOG("Recieved VSET from ExecutePipe, resuming fetching");
+        // schedule the fetch event with a delay of 0 cycles, because we've been stalled on vset
+        fetch_inst_event_->schedule(sparta::Clock::Cycle(0));
+    }
     // Called when decode has room
     void Fetch::receiveFetchQueueCredits_(const uint32_t & dat) {
         credits_inst_queue_ += dat;