Skip to content

Commit

Permalink
fix(Captions): use proper XML parsing (#468)
Browse files Browse the repository at this point in the history
  • Loading branch information
Betree authored Jul 17, 2024
1 parent c7efee0 commit 78e2851
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 21 deletions.
39 changes: 19 additions & 20 deletions apps/cf/lib/videos/captions_fetcher_youtube.ex
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ defmodule CF.Videos.CaptionsFetcherYoutube do
@behaviour CF.Videos.CaptionsFetcher

require Logger
import SweetXml

@impl true
def fetch(%{youtube_id: youtube_id, language: language}) do
Expand Down Expand Up @@ -69,31 +70,29 @@ defmodule CF.Videos.CaptionsFetcherYoutube do

# Parses the raw transcript XML document into a list of caption maps.
#
# Each `<text>` element found under `<transcript>` becomes a map with:
#   * `:text`     - the element's text, normalised by `clean_text/1`
#   * `:start`    - the `start` attribute converted by `parse_float/1`
#   * `:duration` - the optional `dur` attribute converted by `parse_float/1`
#
# The XML is handed to SweetXml as-is. Stripping the prolog or splitting on
# `</text>` first would feed the parser fragments instead of a well-formed
# document.
defp process_transcript(transcript) do
  transcript
  |> SweetXml.xpath(
    ~x"//transcript/text"l,
    text: ~x"./text()"s |> transform_by(&clean_text/1),
    start: ~x"./@start"s |> transform_by(&parse_float/1),
    duration: ~x"./@dur"os |> transform_by(&parse_float/1)
  )
  # Drop entries that cannot be positioned on the timeline (no start time)
  # or that carry no visible text. A missing duration is tolerated.
  |> Enum.filter(fn %{text: text, start: start} ->
    start != nil and text != nil and text != ""
  end)
end

defp process_line(line) do
%{"start" => start} = Regex.named_captures(~r/start="(?<start>[\d.]+)"/, line)
%{"dur" => dur} = Regex.named_captures(~r/dur="(?<dur>[\d.]+)"/, line)

text =
line
|> String.replace("&amp;", "&")
|> String.replace(~r/<text.+>/, "")
|> String.replace(~r"</?[^>]+(>|$)", "")
|> HtmlEntities.decode()
|> String.trim()

%{start: parse_float(start), duration: parse_float(dur), text: text}
# Normalises a caption's text: replaces every literal "&amp;" with "&",
# decodes the remaining HTML entities with `HtmlEntities.decode/1`, then
# trims surrounding whitespace.
# NOTE(review): the explicit "&amp;" pass presumably exists because the
# captions arrive double-escaped (e.g. "&amp;#39;") - confirm against real data.
defp clean_text(text) do
  unescaped = String.replace(text, "&amp;", "&")
  decoded = HtmlEntities.decode(unescaped)
  String.trim(decoded)
end

# Converts a numeric string such as "12.34" into a float.
#
# Returns the parsed number when `val` starts with something `Float.parse/1`
# accepts (any trailing remainder is ignored), and `nil` when nothing can be
# parsed. There is no assertive `{num, _} = Float.parse(val)` match here: it
# would raise `MatchError` on unparseable input before the fallback could run.
defp parse_float(val) when is_binary(val) do
  case Float.parse(val) do
    {num, _rest} -> num
    :error -> nil
  end
end

# Non-binary input (e.g. `nil` for an absent attribute) has no numeric value.
defp parse_float(_val), do: nil

# Below is an implementation using the official YouTube API, but it requires OAuth2 authentication.
Expand Down
1 change: 1 addition & 0 deletions apps/cf/mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ defmodule CF.Mixfile do
{:yaml_elixir, "~> 2.9.0"},
{:jason, "~> 1.4"},
{:openai, "~> 0.6.1"},
{:sweet_xml, "~> 0.7.4"},

# ---- Internal ----
{:db, in_umbrella: true},
Expand Down
2 changes: 1 addition & 1 deletion apps/cf_graphql/lib/schema/types/video_caption.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ defmodule CF.Graphql.Schema.Types.VideoCaption do
@desc "Caption start time (in seconds)"
field(:start, non_null(:float))
@desc "Caption duration (in seconds)"
field(:duration, non_null(:float))
field(:duration, :float)
end
end

0 comments on commit 78e2851

Please sign in to comment.