diff --git a/subtitles.go b/subtitles.go index 97fb1c7..0a4997d 100644 --- a/subtitles.go +++ b/subtitles.go @@ -427,6 +427,7 @@ func (l Line) String() string { type LineItem struct { InlineStyle *StyleAttributes Style *Style + StartAt time.Duration Text string } diff --git a/webvtt.go b/webvtt.go index 3b1f5e4..a402e5c 100644 --- a/webvtt.go +++ b/webvtt.go @@ -34,6 +34,7 @@ var ( bytesWebVTTItalicStartTag = []byte("") bytesWebVTTTimeBoundariesSeparator = []byte(webvttTimeBoundariesSeparator) webVTTRegexpTag = regexp.MustCompile(`()`) + inlineTimestampRegexp = regexp.MustCompile(`<((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})>`) webVTTEscaper = strings.NewReplacer("&", "&", "<", "<") webVTTUnescaper = strings.NewReplacer("&", "&", "<", "<") ) @@ -357,29 +358,80 @@ func parseTextWebVTT(i string) (o Line) { } case html.TextToken: - if s := strings.TrimSpace(string(tr.Raw())); s != "" { - // Get style attribute - var sa *StyleAttributes - if len(webVTTTagStack) > 0 { - tags := make([]WebVTTTag, len(webVTTTagStack)) - copy(tags, webVTTTagStack) - sa = &StyleAttributes{ - WebVTTTags: tags, - } - sa.propagateWebVTTAttributes() + s := string(tr.Raw()) + // Get style attribute + var sa *StyleAttributes + if len(webVTTTagStack) > 0 { + tags := make([]WebVTTTag, len(webVTTTagStack)) + copy(tags, webVTTTagStack) + sa = &StyleAttributes{ + WebVTTTags: tags, } - - // Append item - o.Items = append(o.Items, LineItem{ - InlineStyle: sa, - Text: unescapeWebVTT(s), - }) + sa.propagateWebVTTAttributes() } + + // Append item + items := parseTextWebVTTText(sa, s) + o.Items = append(o.Items, items...) } } return } +func parseTextWebVTTText(sa *StyleAttributes, line string) []LineItem { + var ret []LineItem + + // split the line by inline timestamps + indexes := inlineTimestampRegexp.FindAllStringSubmatchIndex(line, -1) + if len(indexes) > 0 { + // get the text before the first timestamp + s := strings.TrimSpace(line[:indexes[0][0]]) + if s != "" { + ret = append(ret, LineItem{ + InlineStyle: sa, + Text: unescapeWebVTT(s), + }) + } + + for i, match := range indexes { + // get the text between the timestamps + endIndex := len(line) + if i+1 < len(indexes) { + endIndex = indexes[i+1][0] + } + s := strings.TrimSpace(line[match[1]:endIndex]) + if s == "" { + continue + } + + // get the timestamp + ts := line[match[2]:match[3]] + + // Parse timestamp + t, err := parseDurationWebVTT(ts) + if err != nil { + log.Printf("astisub: parsing webvtt duration %s failed, ignoring: %v", ts, err) + } + + ret = append(ret, LineItem{ + InlineStyle: sa, + Text: unescapeWebVTT(s), + StartAt: t, + }) + } + } else { + s := strings.TrimSpace(line) + if s != "" { + ret = append(ret, LineItem{ + InlineStyle: sa, + Text: unescapeWebVTT(s), + }) + } + } + + return ret +} + // formatDurationWebVTT formats a .vtt duration func formatDurationWebVTT(i time.Duration) string { return formatDuration(i, ".", 3) @@ -559,6 +611,11 @@ func (l Line) webVTTBytes() (c []byte) { } func (li LineItem) webVTTBytes() (c []byte) { + // Add timestamp + if li.StartAt > 0 { + c = append(c, []byte("<"+formatDurationWebVTT(li.StartAt)+">")...) + } + // Get color var color string if li.InlineStyle != nil && li.InlineStyle.TTMLColor != nil { diff --git a/webvtt_internal_test.go b/webvtt_internal_test.go index ce97faa..495f5d7 100644 --- a/webvtt_internal_test.go +++ b/webvtt_internal_test.go @@ -36,6 +36,35 @@ func TestParseTextWebVTT(t *testing.T) { assert.Equal(t, 1, len(s.Items)) assert.Equal(t, "Incorrect end tag", s.Items[0].Text) }) + + t.Run("When inline timestamps are included", func(t *testing.T) { + testData := `<00:01:01.000>With inline <00:01:02.000>timestamps` + + s := parseTextWebVTT(testData) + assert.Equal(t, 2, len(s.Items)) + assert.Equal(t, "With inline", s.Items[0].Text) + assert.Equal(t, time.Minute+time.Second, s.Items[0].StartAt) + assert.Equal(t, "timestamps", s.Items[1].Text) + assert.Equal(t, time.Minute+2*time.Second, s.Items[1].StartAt) + }) + + t.Run("When inline timestamps together", func(t *testing.T) { + testData := `<00:01:01.000><00:01:02.000>With timestamp tags together` + + s := parseTextWebVTT(testData) + assert.Equal(t, 1, len(s.Items)) + assert.Equal(t, "With timestamp tags together", s.Items[0].Text) + assert.Equal(t, time.Minute+2*time.Second, s.Items[0].StartAt) + }) + + t.Run("When inline timestamps is at end", func(t *testing.T) { + testData := `With end timestamp<00:01:02.000>` + + s := parseTextWebVTT(testData) + assert.Equal(t, 1, len(s.Items)) + assert.Equal(t, "With end timestamp", s.Items[0].Text) + assert.Equal(t, time.Duration(0), s.Items[0].StartAt) + }) } func TestTimestampMap(t *testing.T) { diff --git a/webvtt_test.go b/webvtt_test.go index 03993dd..b1b5b7c 100644 --- a/webvtt_test.go +++ b/webvtt_test.go @@ -179,12 +179,15 @@ func TestWebVTTTags(t *testing.T) { Text here 00:05:00.000 --> 00:06:00.000 - Joe says something Bob says something` + Joe says something Bob says something + + 00:06:00.000 --> 00:07:00.000 + Text with a <00:06:30.000>timestamp in the middle` s, err := astisub.ReadFromWebVTT(strings.NewReader(testData)) require.NoError(t, err) - require.Len(t, s.Items, 5) + require.Len(t, s.Items, 6) b := &bytes.Buffer{} err = s.WriteToWebVTT(b) @@ -210,5 +213,9 @@ func TestWebVTTTags(t *testing.T) { 5 00:05:00.000 --> 00:06:00.000 Joe says something Bob says something + +6 +00:06:00.000 --> 00:07:00.000 +Text with a <00:06:30.000>timestamp in the middle `, b.String()) }