Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Increase performance of SWD and JTAG bitbanging #1688

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 54 additions & 33 deletions src/platforms/common/jtagtap.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@
jtag_proc_s jtag_proc;

static void jtagtap_reset(void);
static void jtagtap_tms_seq(uint32_t tms_states, size_t ticks);
static void jtagtap_tdi_tdo_seq(uint8_t *data_out, bool final_tms, const uint8_t *data_in, size_t clock_cycles);
static void jtagtap_tdi_seq(bool final_tms, const uint8_t *data_in, size_t clock_cycles);
static bool jtagtap_next(bool tms, bool tdi);
static void jtagtap_cycle(bool tms, bool tdi, size_t clock_cycles);
static inline void platform_delay_busy(const uint32_t loops) __attribute__((always_inline));
static void jtagtap_tms_seq(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3)));
static void jtagtap_tdi_tdo_seq(uint8_t *data_out, bool final_tms, const uint8_t *data_in, size_t clock_cycles)
__attribute__((optimize(3)));
static void jtagtap_tdi_seq(bool final_tms, const uint8_t *data_in, size_t clock_cycles) __attribute__((optimize(3)));
static bool jtagtap_next(bool tms, bool tdi) __attribute__((optimize(3)));
static void jtagtap_cycle(bool tms, bool tdi, size_t clock_cycles) __attribute__((optimize(3)));

void jtagtap_init(void)
{
Expand All @@ -51,41 +53,51 @@ void jtagtap_init(void)
jtag_proc.tap_idle_cycles = 1;

/* Ensure we're in JTAG mode */
for (size_t i = 0; i <= 50U; ++i)
jtagtap_next(true, false); /* 50 + 1 idle cycles for SWD reset */
jtagtap_tms_seq(0xe73cU, 16U); /* SWD to JTAG sequence */
jtagtap_cycle(true, false, 51U); /* 50 + 1 idle cycles for SWD reset */
jtagtap_tms_seq(0xe73cU, 16U); /* SWD to JTAG sequence */
}

static void jtagtap_reset(void)
{
#ifdef TRST_PORT
if (platform_hwversion() == 0) {
gpio_clear(TRST_PORT, TRST_PIN);
for (volatile size_t i = 0; i < 10000U; i++)
continue;
/* Hold low for approximately 1.5 ms */
platform_delay(1U); /* Requires SysTick interrupt to be unblocked */
gpio_set(TRST_PORT, TRST_PIN);
}
#endif
jtagtap_soft_reset();
}

/* Busy-looping delay snippet for GPIO bitbanging (rely on inlining) */
static inline void platform_delay_busy(const uint32_t loops)
{
for (register uint32_t counter = loops; --counter > 0U;)
__asm__("");
}

static bool jtagtap_next_clk_delay()
{
gpio_set(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
const uint16_t result = gpio_get(TDO_PORT, TDO_PIN);
gpio_clear(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
return result != 0;
}

static bool jtagtap_next_no_delay() __attribute__((optimize(3)));

static bool jtagtap_next_no_delay()
{
gpio_set(TCK_PORT, TCK_PIN);
/* Stretch the clock high time */
__asm__("nop" ::: "memory");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please include a comment on what this is doing - we don't want to assume that the reader has seen the other uses of this and knows what's going on. Is there any particular reason for the inserting of a no-op rather than only a reordering barrier?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Annotated for the readers. This routine is too fast in comparison to others, because it has nothing to do besides toggle the jTCK pin, and it certainly does not move data into (or out of) BMP. I may even want to add more nops or something similar to tone down this one.

const uint16_t result = gpio_get(TDO_PORT, TDO_PIN);
gpio_clear(TCK_PORT, TCK_PIN);
/* Stretch the clock low time */
__asm__("nop" ::: "memory");
return result != 0;
}

Expand All @@ -105,15 +117,16 @@ static void jtagtap_tms_seq_clk_delay(uint32_t tms_states, const size_t clock_cy
const bool state = tms_states & 1U;
gpio_set_val(TMS_PORT, TMS_PIN, state);
gpio_set(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
tms_states >>= 1U;
gpio_clear(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
}
}

static void jtagtap_tms_seq_no_delay(uint32_t tms_states, const size_t clock_cycles)
__attribute__((noinline, optimize(3)));

static void jtagtap_tms_seq_no_delay(uint32_t tms_states, const size_t clock_cycles)
{
bool state = tms_states & 1U;
Expand All @@ -130,13 +143,13 @@ static void jtagtap_tms_seq_no_delay(uint32_t tms_states, const size_t clock_cyc
}
}

static void jtagtap_tms_seq(const uint32_t tms_states, const size_t ticks)
static void jtagtap_tms_seq(const uint32_t tms_states, const size_t clock_cycles)
{
gpio_set(TDI_PORT, TDI_PIN);
if (target_clk_divider != UINT32_MAX)
jtagtap_tms_seq_clk_delay(tms_states, ticks);
jtagtap_tms_seq_clk_delay(tms_states, clock_cycles);
else
jtagtap_tms_seq_no_delay(tms_states, ticks);
jtagtap_tms_seq_no_delay(tms_states, clock_cycles);
}

static void jtagtap_tdi_tdo_seq_clk_delay(
Expand All @@ -153,8 +166,7 @@ static void jtagtap_tdi_tdo_seq_clk_delay(
gpio_set_val(TDI_PORT, TDI_PIN, data_in[byte] & (1U << bit));
/* Start the clock cycle */
gpio_set(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
/* If TDO is high, store a 1 in the appropriate position in the value being accumulated */
if (gpio_get(TDO_PORT, TDO_PIN))
value |= 1U << bit;
Expand All @@ -164,8 +176,7 @@ static void jtagtap_tdi_tdo_seq_clk_delay(
}
/* Finish the clock cycle */
gpio_clear(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
}
/* If clock_cycles is not divisible by 8, we have some extra data to write back here. */
if (clock_cycles & 7U) {
Expand All @@ -174,6 +185,9 @@ static void jtagtap_tdi_tdo_seq_clk_delay(
}
}

static void jtagtap_tdi_tdo_seq_no_delay(const uint8_t *const data_in, uint8_t *const data_out, const bool final_tms,
const size_t clock_cycles) __attribute__((optimize(3)));

static void jtagtap_tdi_tdo_seq_no_delay(
const uint8_t *const data_in, uint8_t *const data_out, const bool final_tms, const size_t clock_cycles)
{
Expand Down Expand Up @@ -223,6 +237,9 @@ static void jtagtap_tdi_tdo_seq_no_delay(
static void jtagtap_tdi_tdo_seq(
uint8_t *const data_out, const bool final_tms, const uint8_t *const data_in, size_t clock_cycles)
{
/* In case the callsite passes NULL for data_out, don't bother sampling TDO */
if (!data_out)
return jtagtap_tdi_seq(final_tms, data_in, clock_cycles);
gpio_clear(TMS_PORT, TMS_PIN);
gpio_clear(TDI_PORT, TDI_PIN);
if (target_clk_divider != UINT32_MAX)
Expand All @@ -241,15 +258,16 @@ static void jtagtap_tdi_seq_clk_delay(const uint8_t *const data_in, const bool f
/* Set up the TDI pin and start the clock cycle */
gpio_set_val(TDI_PORT, TDI_PIN, data_in[byte] & (1U << bit));
gpio_set(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
/* Finish the clock cycle */
gpio_clear(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
}
}

static void jtagtap_tdi_seq_no_delay(const uint8_t *const data_in, const bool final_tms, size_t clock_cycles)
__attribute__((optimize(3)));

static void jtagtap_tdi_seq_no_delay(const uint8_t *const data_in, const bool final_tms, size_t clock_cycles)
{
for (size_t cycle = 0; cycle < clock_cycles;) {
Expand Down Expand Up @@ -294,20 +312,23 @@ static void jtagtap_cycle_clk_delay(const size_t clock_cycles)
{
for (size_t cycle = 0; cycle < clock_cycles; ++cycle) {
gpio_set(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
gpio_clear(TCK_PORT, TCK_PIN);
for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter)
continue;
platform_delay_busy(target_clk_divider);
}
}

static void jtagtap_cycle_no_delay(const size_t clock_cycles) __attribute__((optimize(3)));

static void jtagtap_cycle_no_delay(const size_t clock_cycles)
{
for (size_t cycle = 0; cycle < clock_cycles; ++cycle) {
gpio_set(TCK_PORT, TCK_PIN);
/* Stretch the clock high time */
__asm__ volatile("nop" ::: "memory");
gpio_clear(TCK_PORT, TCK_PIN);
/* Stretch the clock low time */
__asm__ volatile("nop" ::: "memory");
}
}

Expand Down
9 changes: 0 additions & 9 deletions src/platforms/common/stm32/gpio.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,6 @@ static inline void bmp_gpio_set(const uint32_t gpioport, const uint16_t gpios)
{
/* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */
GPIO_BSRR(gpioport) = gpios;
#if defined(STM32F4) || defined(STM32F7)
/* FIXME: Check if doubling is still needed */
/* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */
GPIO_BSRR(gpioport) = gpios;
#endif
}

#define gpio_set bmp_gpio_set
Expand All @@ -44,10 +39,6 @@ static inline void bmp_gpio_clear(const uint32_t gpioport, const uint16_t gpios)
/* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */
GPIO_BRR(gpioport) = gpios;
#else
#if defined(STM32F4) || defined(STM32F7)
/* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */
GPIO_BSRR(gpioport) = gpios << 16U;
#endif
/* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */
GPIO_BSRR(gpioport) = gpios << 16U;
#endif
Expand Down
51 changes: 49 additions & 2 deletions src/platforms/common/stm32/timing_stm32.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

bool running_status = false;
static volatile uint32_t time_ms = 0;
uint32_t target_clk_divider = 0;
uint32_t target_clk_divider = UINT32_MAX;

static size_t morse_tick = 0;
#if defined(PLATFORM_HAS_POWER_SWITCH) && defined(STM32F1)
Expand Down Expand Up @@ -140,9 +140,23 @@ uint32_t platform_time_ms(void)
* per delay loop count with 2 delay loops per clock
*/

#if defined(STM32F4)
/* Values for STM32F411 at 96 MHz */
#define USED_SWD_CYCLES 12
#define CYCLES_PER_CNT 4
#elif defined(STM32F1)
/* Values for STM32F103 at 72 MHz */
#define USED_SWD_CYCLES 18
#define CYCLES_PER_CNT 4
#elif defined(STM32F0)
/* Values for STM32F072 at 48 MHz */
#define USED_SWD_CYCLES 16
#define CYCLES_PER_CNT 6
#else
/* Inherit defaults for other platforms (F3, F7) */
#define USED_SWD_CYCLES 22
#define CYCLES_PER_CNT 10
#endif

void platform_max_frequency_set(const uint32_t frequency)
{
Expand Down Expand Up @@ -172,6 +186,11 @@ void platform_max_frequency_set(const uint32_t frequency)
target_clk_divider = UINT32_MAX;
return;
}
/* A zero loops delay will underflow and hang in platform_delay_busy() */
if (divisor == 0U) {
target_clk_divider = UINT32_MAX;
return;
}
divisor /= 2U;
target_clk_divider = divisor / (CYCLES_PER_CNT * frequency);
if (target_clk_divider * (CYCLES_PER_CNT * frequency) < divisor)
Expand All @@ -194,8 +213,36 @@ uint32_t platform_max_frequency_get(void)
const uint32_t ratio = (target_clk_divider * BITBANG_DIVIDER_FACTOR) + BITBANG_DIVIDER_OFFSET;
return rcc_ahb_frequency / ratio;
#else
if (target_clk_divider == UINT32_MAX)
return rcc_ahb_frequency / USED_SWD_CYCLES;
uint32_t result = rcc_ahb_frequency;
result /= USED_SWD_CYCLES + CYCLES_PER_CNT * target_clk_divider;
result /= USED_SWD_CYCLES + CYCLES_PER_CNT * target_clk_divider * 2U;
return result;
#endif
}

/* Busy-looping delay for GPIO bitbanging operations. SUBS+BNE.N take 4 cycles. */
void platform_delay_busy(const uint32_t loops)
{
/* Avoid using `volatile` variables which incur stack accesses */
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cppreference page on C's volatile does not mention anything about forced stack usage and we've never seen it imply any such - are we sure that is what the compiler's doing here? Either way, we would strongly prefer you to use a for loop not a do-while here. This is a for loop in disguise.

The disappearance of the loop when using continue; without volatile can be entirely explained not so much by DCE (it's the result, not the cause) but by there being no observable side effects thus allowing the compiler to assume that nothing of value is done by the loop. This is the reason for using volatile in the jtagtap code. Perhaps we can use volatile register to force the compiler to consider the operation to have observable side-effects but also keep the loop counter register-bound?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed volatile register was the first thing I tried, only to be yelled at by the compiler to swap qualifiers... and identical codegen. I specifically wanted to avoid extraneous ldr/str of loop counters, so I monitored Puncover and objdump -CSd. Maybe Compiler Explorer can shed some light on why this codegens this way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh how interesting - ok, that'll be an interesting session in CE then as yes, that might well provide some insight into how and why the compiler's choosing to do what it does.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably qualifying a variable as volatile forces the compiler to assume that it should be observable elsewhere, and hence needs to be pinned to SRAM (if only on the stack). This directly contradicts with the intent of register-allocated counters (which debuggers don't display due to "optimized away" but you can always dump the gp_regs). Something "dead store elimination".
And the for-loop form is longer and has an extra jump, I didn't subjectively like it; objectively it has to fit into Flash prefetch buffer which is a few (4) 64-bit lines on F103 (?) to not incur wait-states of 72MHz MCU against 24MHz flash on random jumps.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From our perspective re the for loop part, the for loop form should code gen to the same or very near the same as the do-while (perhaps needs the condition expressed as --counter > 0 and --counter in the update step removed to do it) while being a much clearer expression of your intent. Think this is very much going to be "take it to Compiler Explorer and tinker for an hour" to find the right thing to do and the right way to express this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your suggestion of --counter > 0 gets the codegen I want and the C idiomatic form you look to preserve, thanks.
Updated the three places accordingly.
m3/m4/m7 still get the SUBS+BNE.N, m0 (f072) gets subs; cmp; bne.n so its divisor timings will be off.

#if 0
register uint32_t i = loops;
do {
/*
* A "tactical" single NOP takes 0-1 cycles on Cortex-M0/M3/M4/M7
* and avoids DCE, but consumes 2 bytes of flash, amplified by static/inline.
* A normal `continue` in for/while/do-loops with no side-effects
* makes the whole loop disappear to DCE at higher than -O1.
*/
__asm__("nop");
} while (--i > 0U);
#else
/*
* Another version which still assembles to SUBS+BNE.N; the NOP can be eliminated
* in favor of an empty inline asm/volatile block if it's enough to suppress DCE.
* Note that the predecrement has to be merged into the condition expression.
*/
for (register uint32_t i = loops; --i > 0U;)
__asm__("");
#endif
}
Loading