diff --git a/subtitles.go b/subtitles.go index cab1750..7541776 100644 --- a/subtitles.go +++ b/subtitles.go @@ -426,6 +426,7 @@ func (l Line) String() string { // LineItem represents a formatted line item type LineItem struct { InlineStyle *StyleAttributes + StartAt time.Duration Style *Style Text string } diff --git a/webvtt.go b/webvtt.go index 3b1f5e4..c0d62b5 100644 --- a/webvtt.go +++ b/webvtt.go @@ -33,6 +33,7 @@ var ( bytesWebVTTItalicEndTag = []byte("") bytesWebVTTItalicStartTag = []byte("") bytesWebVTTTimeBoundariesSeparator = []byte(webvttTimeBoundariesSeparator) + webVTTRegexpInlineTimestamp = regexp.MustCompile(`<((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})>`) webVTTRegexpTag = regexp.MustCompile(`()`) webVTTEscaper = strings.NewReplacer("&", "&", "<", "<") webVTTUnescaper = strings.NewReplacer("&", "&", "<", "<") @@ -357,26 +358,70 @@ func parseTextWebVTT(i string) (o Line) { } case html.TextToken: - if s := strings.TrimSpace(string(tr.Raw())); s != "" { - // Get style attribute - var sa *StyleAttributes - if len(webVTTTagStack) > 0 { - tags := make([]WebVTTTag, len(webVTTTagStack)) - copy(tags, webVTTTagStack) - sa = &StyleAttributes{ - WebVTTTags: tags, - } - sa.propagateWebVTTAttributes() + // Get style attribute + var sa *StyleAttributes + if len(webVTTTagStack) > 0 { + tags := make([]WebVTTTag, len(webVTTTagStack)) + copy(tags, webVTTTagStack) + sa = &StyleAttributes{ + WebVTTTags: tags, } - - // Append item - o.Items = append(o.Items, LineItem{ - InlineStyle: sa, - Text: unescapeWebVTT(s), - }) + sa.propagateWebVTTAttributes() } + + // Append items + o.Items = append(o.Items, parseTextWebVTTTextToken(sa, string(tr.Raw()))...) + } + } + return +} + +func parseTextWebVTTTextToken(sa *StyleAttributes, line string) (ret []LineItem) { + // split the line by inline timestamps + indexes := webVTTRegexpInlineTimestamp.FindAllStringSubmatchIndex(line, -1) + + if len(indexes) == 0 { + if s := strings.TrimSpace(line); s != "" { + return []LineItem{{ + InlineStyle: sa, + Text: unescapeWebVTT(s), + }} + } + return + } + + // get the text before the first timestamp + if s := strings.TrimSpace(line[:indexes[0][0]]); s != "" { + ret = append(ret, LineItem{ + InlineStyle: sa, + Text: unescapeWebVTT(s), + }) + } + + for i, match := range indexes { + // get the text between the timestamps + endIndex := len(line) + if i+1 < len(indexes) { + endIndex = indexes[i+1][0] + } + s := strings.TrimSpace(line[match[1]:endIndex]) + if s == "" { + continue } + + // Parse timestamp + t, err := parseDurationWebVTT(line[match[2]:match[3]]) + if err != nil { + log.Printf("astisub: parsing webvtt duration %s failed, ignoring: %v", line[match[2]:match[3]], err) + } + + ret = append(ret, LineItem{ + InlineStyle: sa, + StartAt: t, + Text: unescapeWebVTT(s), + }) } + return } @@ -559,6 +604,11 @@ func (l Line) webVTTBytes() (c []byte) { } func (li LineItem) webVTTBytes() (c []byte) { + // Add timestamp + if li.StartAt > 0 { + c = append(c, []byte("<"+formatDurationWebVTT(li.StartAt)+">")...) + } + // Get color var color string if li.InlineStyle != nil && li.InlineStyle.TTMLColor != nil { diff --git a/webvtt_internal_test.go b/webvtt_internal_test.go index ce97faa..495f5d7 100644 --- a/webvtt_internal_test.go +++ b/webvtt_internal_test.go @@ -36,6 +36,35 @@ func TestParseTextWebVTT(t *testing.T) { assert.Equal(t, 1, len(s.Items)) assert.Equal(t, "Incorrect end tag", s.Items[0].Text) }) + + t.Run("When inline timestamps are included", func(t *testing.T) { + testData := `<00:01:01.000>With inline <00:01:02.000>timestamps` + + s := parseTextWebVTT(testData) + assert.Equal(t, 2, len(s.Items)) + assert.Equal(t, "With inline", s.Items[0].Text) + assert.Equal(t, time.Minute+time.Second, s.Items[0].StartAt) + assert.Equal(t, "timestamps", s.Items[1].Text) + assert.Equal(t, time.Minute+2*time.Second, s.Items[1].StartAt) + }) + + t.Run("When inline timestamps together", func(t *testing.T) { + testData := `<00:01:01.000><00:01:02.000>With timestamp tags together` + + s := parseTextWebVTT(testData) + assert.Equal(t, 1, len(s.Items)) + assert.Equal(t, "With timestamp tags together", s.Items[0].Text) + assert.Equal(t, time.Minute+2*time.Second, s.Items[0].StartAt) + }) + + t.Run("When inline timestamps is at end", func(t *testing.T) { + testData := `With end timestamp<00:01:02.000>` + + s := parseTextWebVTT(testData) + assert.Equal(t, 1, len(s.Items)) + assert.Equal(t, "With end timestamp", s.Items[0].Text) + assert.Equal(t, time.Duration(0), s.Items[0].StartAt) + }) } func TestTimestampMap(t *testing.T) { diff --git a/webvtt_test.go b/webvtt_test.go index 03993dd..b1b5b7c 100644 --- a/webvtt_test.go +++ b/webvtt_test.go @@ -179,12 +179,15 @@ func TestWebVTTTags(t *testing.T) { Text here 00:05:00.000 --> 00:06:00.000 - Joe says something Bob says something` + Joe says something Bob says something + + 00:06:00.000 --> 00:07:00.000 + Text with a <00:06:30.000>timestamp in the middle` s, err := astisub.ReadFromWebVTT(strings.NewReader(testData)) require.NoError(t, err) - require.Len(t, s.Items, 5) + require.Len(t, s.Items, 6) b := &bytes.Buffer{} err = s.WriteToWebVTT(b) @@ -210,5 +213,9 @@ func TestWebVTTTags(t *testing.T) { 5 00:05:00.000 --> 00:06:00.000 Joe says something Bob says something + +6 +00:06:00.000 --> 00:07:00.000 +Text with a <00:06:30.000>timestamp in the middle `, b.String()) }