Improved detect silence #745

Open · wants to merge 3 commits into master
175 changes: 132 additions & 43 deletions pydub/silence.py
"""
Various functions for finding/manipulating silence in AudioSegments
"""
import itertools
import numpy as np

from .utils import db_to_float


[Review comment] How about adding a property to AudioSegment? Something like:

    @property
    def as_numpy(self):

(A sketch of this suggestion appears after the function below.)

def _convert_to_numpy(audio_segment):
"""
Returns a numpy array view of the raw samples of an AudioSegment,
with shape (number of frames, channels).

Does not allocate any additional memory.

audio_segment - the segment to convert into a numpy array
"""
dtype = {
1: np.int8,
2: np.int16,
4: np.int32
}[audio_segment.sample_width]
x = np.frombuffer(audio_segment.raw_data, dtype=dtype).reshape(-1, audio_segment.channels)
return x
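One way the reviewer's suggestion could be completed, assuming the conversion logic moves onto AudioSegment itself (the property name as_numpy and this body are a sketch of the suggestion, not part of the PR's diff):

    @property
    def as_numpy(self):
        """
        Returns a numpy array view of the raw samples of this AudioSegment,
        with shape (number of frames, channels), without copying.
        """
        # hypothetical completion: same dtype selection as _convert_to_numpy
        dtype = {1: np.int8, 2: np.int16, 4: np.int32}[self.sample_width]
        return np.frombuffer(self.raw_data, dtype=dtype).reshape(-1, self.channels)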


def detect_silence(audio_segment, min_silence_len=1000, silence_thresh=-16, seek_step=1, max_buffer_size_kb=100*1024):
    """
    Returns a list of all silent sections [start, end] in milliseconds of audio_segment.
    Inverse of detect_nonsilent().

    audio_segment - the segment to find silence in
    min_silence_len - the minimum length for any silent section
    silence_thresh - the upper bound for how quiet is silent in dBFS
    seek_step - step size for iterating over the segment in ms
    max_buffer_size_kb - the maximum size of internally allocated buffers in KiB
    """
    raw_data = _convert_to_numpy(audio_segment)
    min_silence_ms = min_silence_len
    silence_threshold_db = silence_thresh
    seek_step_ms = seek_step

    seg_len_ms = len(audio_segment)
    frames_per_ms = audio_segment.frame_rate / 1000

    assert raw_data.shape[0] == audio_segment.frame_count()
    assert raw_data.shape[1] == audio_segment.channels

    max_frames_in_slice = int(np.ceil(min_silence_ms * frames_per_ms))

    # determine number of frames in computation window buffer
    if max_buffer_size_kb >= 0:
        bytes_per_frame = 8
        frames_per_kb = 1024 // bytes_per_frame
        buffer_len = max_buffer_size_kb * frames_per_kb
        # empirical testing shows that we need approximately 4 times as much memory
        # as buffer_len would suggest, probably because numpy allocates additional buffers
        # in the background during computations; we correct for this by reducing buffer_len accordingly
        correction_constant = 4
        buffer_len //= correction_constant
        if buffer_len < max_frames_in_slice:
            min_buffer_size = int(np.ceil(max_frames_in_slice * bytes_per_frame / 1024)) * correction_constant
            raise ValueError(
                "Buffer is too small: max_buffer_size_kb must be at least {} KiB "
                "for the given min_silence_len of {} ms".format(min_buffer_size, min_silence_ms)
            )
    else:
        buffer_len = len(raw_data)  # no restrictions!
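    # Rough numbers for the defaults (an illustrative calculation, not from the PR):
    # with max_buffer_size_kb = 100*1024 KiB and frames_per_kb = 1024 // 8 = 128,
    # buffer_len = 100*1024*128 // 4 = 3,276,800 frames, i.e. about 74 s at 44.1 kHz.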


    # you can't have a silent portion of a sound that is longer than the sound
    if seg_len_ms < min_silence_ms:
        return []

    # convert silence threshold to a float value (so we can compare it to rms)
    normalization_const = float(audio_segment.max_possible_amplitude)
    silence_thresh = db_to_float(silence_threshold_db) * audio_segment.max_possible_amplitude / normalization_const
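    # For reference, db_to_float(-16) == 10 ** (-16 / 20) ~= 0.158, so the default
    # threshold treats slices whose RMS is below ~15.8% of full scale as silence.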

    # check successive (1 sec by default) chunk of sound for silence
    # try a chunk at every "seek step" (or every chunk for a seek step == 1)
    last_slice_start = seg_len_ms - min_silence_ms
    slice_starts = range(0, last_slice_start + 1, seek_step_ms)

    # guarantee last_slice_start is included in the range
    # to make sure the last portion of the audio is searched
    if last_slice_start % seek_step_ms:
        slice_starts = itertools.chain(slice_starts, [last_slice_start])

    # list of all continuous regions of silence (start ms - end ms)
    silent_ranges = []

    prev_silent_ms = None
    current_range_start = None

    # load first window into buffer
    # the cumsq_per_frame buffer holds the cumulative sum of the mean square over channels for
    # each frame, normalized by max_possible_amplitude (i.e., all values x satisfy 0 <= x <= 1) -
    # this prevents the cumulative sums of squares from exceeding representable values
    cumsq_per_frame = np.concatenate((
        [0.],
        np.cumsum(
            np.mean((raw_data[0:buffer_len].astype(np.float64))**2, axis=-1)
            / (normalization_const**2)
        )
    ))
    # keep track of the frames currently in the buffer
    buffer_offset = 0
    buffer_end = buffer_len
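    # With C = cumsq_per_frame, the mean square of frames [a, b) is (C[b] - C[a]) / (b - a),
    # so each slice's RMS below costs O(1) instead of re-reading a full window of samples.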

    for slice_start_ms in slice_starts:
        slice_start = int(slice_start_ms * frames_per_ms)
        slice_end = min(int((slice_start_ms + min_silence_ms) * frames_per_ms), len(raw_data))
        assert slice_end <= len(raw_data)
        # if the frame_rate is not divisible by min_silence_ms, slices may have slightly
        # varying lengths, so we compute the actual length of the concrete slice here
        frames_in_slice = slice_end - slice_start

        if slice_end > buffer_end:  # we ran out of buffer; load next window into buffer
            cumsq_per_frame = np.concatenate((
                [0.],
                np.cumsum(
                    np.mean(
                        (raw_data[slice_start:slice_start + buffer_len].astype(np.float64))**2,
                        axis=-1
                    ) / (normalization_const**2)
                )
            ))
            buffer_offset = slice_start
            buffer_end = buffer_offset + buffer_len

        # compute the RMS for the current slice from the cumulative sums of squares in the buffer
        slice_msq = cumsq_per_frame[slice_end - buffer_offset] - cumsq_per_frame[slice_start - buffer_offset]
        rms = np.sqrt(slice_msq / frames_in_slice)

        if rms <= silence_thresh:
            # current slice is silent; combine with the preceding silent slice if there is no nonsilent gap
            if current_range_start is None:
                current_range_start = slice_start_ms
            else:
                continuous = (slice_start_ms == prev_silent_ms + seek_step_ms)

                # sometimes two small blips are enough for one particular slice to be
                # non-silent, despite the silence all running together. Just combine
                # the two overlapping silent ranges.
                silence_has_gap = slice_start_ms > (prev_silent_ms + min_silence_ms)

                if not continuous and silence_has_gap:
                    silent_ranges.append([
                        current_range_start,
                        prev_silent_ms + min_silence_ms
                    ])
                    current_range_start = slice_start_ms

            prev_silent_ms = slice_start_ms

    if current_range_start is not None:
        assert prev_silent_ms is not None
        silent_ranges.append([current_range_start,
                              prev_silent_ms + min_silence_ms])

    return silent_ranges
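A minimal usage sketch of the updated function (the file name and parameter values are illustrative, not taken from the PR):

    from pydub import AudioSegment
    from pydub.silence import detect_silence

    seg = AudioSegment.from_wav("speech.wav")
    # report stretches of at least 500 ms that stay below -40 dBFS, scanning in 10 ms steps
    for start_ms, end_ms in detect_silence(seg, min_silence_len=500,
                                           silence_thresh=-40, seek_step=10):
        print("silence: {} ms to {} ms".format(start_ms, end_ms))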

3 changes: 3 additions & 0 deletions setup.py
    keywords='audio sound high-level',
    url='http://pydub.com',
    packages=['pydub'],
    install_requires=[
        "numpy >= 1.16, < 2.0",
    ],
    long_description=__doc__,
    classifiers=[
        'Development Status :: 5 - Production/Stable',
40 changes: 40 additions & 0 deletions test/test.py
            # (context: end of test_realistic_audio)
            self.assertTrue(start > prev_end)
            prev_end = end

    def test_detect_silence_noncontinuous_without_gap(self):
        # samples are little-endian signed 16-bit: b"\xFF\x7F" == 32767 (full scale), b"\x00\x00" == 0
        raw_data = b"\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\x00\x00\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F"
        seg = AudioSegment(raw_data, frame_rate=1000, channels=1, sample_width=2)

        self.assertEqual(seg.frame_count(), 11.0)
        self.assertEqual(len(seg), 11)

        silent_ranges = detect_silence(seg, min_silence_len=3, silence_thresh=-4, seek_step=1)
        self.assertEqual(silent_ranges, [[1, 10]])

    def test_detect_silence_noncontinuous_with_gap(self):
        raw_data = b"\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F"
        seg = AudioSegment(raw_data, frame_rate=1000, channels=1, sample_width=2)

        self.assertEqual(seg.frame_count(), 11.0)
        self.assertEqual(len(seg), 11)

        silent_ranges = detect_silence(seg, min_silence_len=3, silence_thresh=-4, seek_step=1)
        self.assertEqual(silent_ranges, [[1, 5], [6, 10]])

    def test_detect_silence_noncontinuous_without_gap_last_slice(self):
        raw_data = b"\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F\x00\x00\x00\x00"
        seg = AudioSegment(raw_data, frame_rate=1000, channels=1, sample_width=2)

        self.assertEqual(seg.frame_count(), 8.0)
        self.assertEqual(len(seg), 8)

        silent_ranges = detect_silence(seg, min_silence_len=3, silence_thresh=-4, seek_step=2)
        self.assertEqual(silent_ranges, [[2, 8]])

    def test_detect_silence_noncontinuous_with_gap_last_slice(self):
        raw_data = b"\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F\xFF\x7F\x00\x00\xFF\x7F\x00\x00"
        seg = AudioSegment(raw_data, frame_rate=1000, channels=1, sample_width=2)

        self.assertEqual(seg.frame_count(), 10.0)
        self.assertEqual(len(seg), 10)

        silent_ranges = detect_silence(seg, min_silence_len=3, silence_thresh=-4, seek_step=2)
        self.assertEqual(silent_ranges, [[2, 5], [7, 10]])
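For reading the raw_data fixtures above: each sample is a little-endian signed 16-bit integer, so b"\xFF\x7F" decodes to 32767 (full scale) and b"\x00\x00" to 0. A quick self-contained check (illustrative only, not part of the PR):

    import struct
    assert struct.unpack("<2h", b"\xFF\x7F\x00\x00") == (32767, 0)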


class GeneratorTests(unittest.TestCase):
