Skip to content

Commit

Permalink
feat: Community videos scraping (#2441)
Browse files Browse the repository at this point in the history
* add deserialization

* add serialization

* swap them

* Update data/cookbook/deserialise-from-json/00-deserialize-yojson.ml

Co-authored-by: sabine <[email protected]>

* Update data/cookbook/deserialise-from-json/00-deserialize-yojson.ml

Co-authored-by: sabine <[email protected]>

* Update data/cookbook/deserialise-from-json/00-deserialize-yojson.ml

Co-authored-by: sabine <[email protected]>

* Update data/cookbook/serialise-to-json/00-serialize-yojson.ml

Co-authored-by: sabine <[email protected]>

* Update data/cookbook/serialise-to-json/00-serialize-yojson.ml

Co-authored-by: sabine <[email protected]>

* Update data/cookbook/serialise-to-json/00-serialize-yojson.ml

Co-authored-by: sabine <[email protected]>

* Data type and scraping for YouTube video MRSS

* YouTube page

* Add UI

* Formatting

* Per source filters

* Only use ocaml as scrape filter

* fix typo

* Scrape and merge

* Formatting

* Generate asset/video.rss

* Single type for Youtube

* minor formatting, etc.

* Revert "minor formatting, etc."

This reverts commit ce6cfc1.

* Use better youtube urls

* Formatting

* Rebase

* Renaming

* Formatting

* Import watch.ocaml.org video list

* WIP

* Process watch.ocaml.org videos

* Aggregate youtube and watch.ocaml.org

* Everything in the right place

* Misc

* remove video route, videos will render under OCaml Planet in a different PR

---------

Co-authored-by: gpopides <[email protected]>
Co-authored-by: gpopides <[email protected]>
Co-authored-by: sabine <[email protected]>
Co-authored-by: Cuihtlauac ALVARADO <[email protected]>
Co-authored-by: Christine Rose <[email protected]>
  • Loading branch information
6 people authored Jul 30, 2024
1 parent 3712051 commit 35bc6ab
Show file tree
Hide file tree
Showing 22 changed files with 3,489 additions and 1,413 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/scrape.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,19 @@ jobs:
- name: Build scraper
run: |
opam exec -- dune build tool/ood-gen/bin/scrape.exe
opam exec -- dune build tool/ood-gen/bin/watch_scrape.exe
- name: Run scrapers
run: |
opam exec -- dune exec tool/ood-gen/bin/scrape.exe planet
opam exec -- dune exec tool/ood-gen/bin/watch_scrape.exe > data/watch.yml
opam exec -- dune exec tool/ood-gen/bin/scrape.exe video
- name: Create Pull Request
uses: peter-evans/create-pull-request@v6
with:
title: '[scrape.yml] New OCaml Planet blog posts and videos from watch.ocaml.org'
title: '[scrape.yml] New OCaml blog posts and videos'
add-paths: |
data/watch.yml
data/video-watch.yml
data/video-youtube.yml
data/planet/*/*.md
commit-message: |
[scrape.yml] New OCaml Planet blog posts and videos from watch.ocaml.org
[scrape.yml] New OCaml Planet blog posts and videos
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ utop: ## Run a REPL and link with the project's libraries
.PHONY: scrape
scrape: ## Generate the po files
opam exec -- dune exec --root . tool/ood-gen/bin/scrape.exe planet
opam exec -- dune exec --root . tool/ood-gen/bin/watch_scrape.exe
opam exec -- dune exec --root . tool/ood-gen/bin/scrape.exe video

.PHONY: docker
docker: ## Generate docker container
Expand Down
1,688 changes: 1,688 additions & 0 deletions data/video-watch.yml

Large diffs are not rendered by default.

1,430 changes: 1,430 additions & 0 deletions data/video-youtube.yml

Large diffs are not rendered by default.

1,361 changes: 0 additions & 1,361 deletions data/watch.yml

This file was deleted.

27 changes: 27 additions & 0 deletions data/youtube.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# YouTube sources for community videos.
# Each entry carries:
#   name       - human-readable label for the source
#   kind       - either `playlist` or `channel`
#   id         - the YouTube identifier of that playlist or channel
#   only_ocaml - optional flag; NOTE(review): presumably restricts scraping to
#                videos that mention OCaml - confirm against the YouTube scraper
- name: Jane Street - Tech Talks
kind: playlist
id: PLCiAikFFaMJoWyXnJ2BWpse5HuiYibNYs
only_ocaml: true
- name: Jane Street - OCaml Unboxed
kind: playlist
id: PLCiAikFFaMJrgFrWRKn0-1EI3gVZLQJtJ
- name: Jane Street - Signal & Threads
kind: playlist
id: PLCiAikFFaMJouorRXDSfS2UoKV4BfKyQm
only_ocaml: true
- name: Emelle TV
kind: channel
id: UCvVVfCa7-nzSuCdMKXnNJNQ
- name: The Vimeagen
kind: channel
id: UCVk4b-svNJoeytrrlOixebQ
only_ocaml: true
- name: OCAML Workshop at ICFP 2023
kind: playlist
id: PLyrlk8Xaylp7Tq5-ZN6jkir-sYrhGi_0E
- name: "OCaml Programming: Correct + Efficient + Beautiful"
kind: playlist
id: PLre5AT9JnKShBOPeuiD9b-I4XROIJhkIU
- name: Collège de France - Sciences du logiciel
kind: playlist
id: PLtimy8tnozICbD45yhB7Ha_zIBJTIK3im
15 changes: 15 additions & 0 deletions dune
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,19 @@
%{target}
(run %{gen_feed} changelog))))))

; Build asset/video.xml by running the feed generator with the `video`
; argument from the workspace root and capturing its standard output.
; Both scraped video listings are declared as dependencies, so the feed is
; regenerated whenever either of them changes.
(subdir
asset/
(rule
(target video.xml)
(deps
(:gen_feed %{workspace_root}/tool/ood-gen/bin/feed.exe)
%{workspace_root}/data/video-youtube.yml
%{workspace_root}/data/video-watch.yml)
(action
(chdir
%{workspace_root}
(with-stdout-to
%{target}
(run %{gen_feed} video))))))

(data_only_dirs playground data practice)
2 changes: 2 additions & 0 deletions src/ocamlorg_data/data.ml
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,8 @@ module Tutorial = struct
|> List.map fst
end

(* Re-export the [Video] module unchanged.
   NOTE(review): presumably this is the [video.ml] produced by the ood-gen
   rule in this directory's dune file - confirm. *)
module Video = Video

module Workshop = struct
include Workshop

Expand Down
6 changes: 6 additions & 0 deletions src/ocamlorg_data/data.mli
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,12 @@ module Tutorial : sig
val search_documents : string -> search_document list
end

(** Community videos. *)
module Video : sig
include module type of Video

(** The list of video entries exposed by this module. *)
val all : t list
end

module Workshop : sig
include module type of Workshop

Expand Down
15 changes: 15 additions & 0 deletions src/ocamlorg_data/data_intf.ml
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,21 @@ module Tutorial = struct
[@@deriving show]
end

(** Data type describing one community video. YAML converters and a
    pretty-printer are derived for it. *)
module Video = struct
type t = {
(* Title of the video. *)
title : string;
(* URL of the video itself. *)
content : string;
(* URL of the thumbnail image. *)
thumbnail : string;
(* Free-form description; falls back to the empty string when the field is
   missing from the YAML input. *)
description : string; [@default ""]
(* Publication timestamp, kept as a string.
   NOTE(review): expected to be RFC 3339, since the feed generator parses it
   as such - confirm for every scraper. *)
published : string;
(* Display name of the video's author. *)
author_name : string;
(* URI associated with the author. *)
author_uri : string;
(* URL of the site or collection the video was taken from. *)
source_link : string;
(* Human-readable name of that site or collection. *)
source_title : string;
}
[@@deriving yaml, show]
end

module Workshop = struct
type role = [ `Co_chair | `Chair ] [@@deriving show]

Expand Down
12 changes: 12 additions & 0 deletions src/ocamlorg_data/dune
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,18 @@
%{target}
(run %{ood_gen} tool_page)))))

; Generate video.ml by running the ood-gen generator with the `video`
; argument from the workspace root and capturing its standard output.
; The generated module embeds the videos from BOTH scraped listings, so both
; data files must be declared as dependencies: otherwise an update of
; video-youtube.yml alone would not trigger a rebuild of video.ml.
(rule
 (target video.ml)
 (deps
  %{workspace_root}/data/video-watch.yml
  %{workspace_root}/data/video-youtube.yml
  (:ood_gen %{workspace_root}/tool/ood-gen/bin/gen.exe))
 (action
  (chdir
   %{workspace_root}
   (with-stdout-to
    %{target}
    (run %{ood_gen} video)))))

(rule
(target workshop.ml)
(deps
Expand Down
5 changes: 4 additions & 1 deletion src/ocamlorg_frontend/components/cards.eml
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
(* [image_url img] is the URL to use for a card image: [img] itself when it
   starts with "http", otherwise the media URL of [img] looked up under
   "resources/". *)
let image_url img =
  let is_remote = String.starts_with ~prefix:"http" img in
  if is_remote then img
  else
    let local_path = "resources/" ^ img in
    Ocamlorg_static.Media.url local_path

let community_resource ~title ~desc ~online_url ~source_url ?(img="") () =
<div class="flex flex-col card dark:dark-card rounded-lg border border-separator_30">
<a href="<%s online_url %>" class="grow">
<div class="w-full h-44 text-white rounded-t-xl bg-cover bg-center" style="background-image: url(<%s Ocamlorg_static.Media.url ("resources/" ^ img) %>)">
<div class="w-full h-44 text-white rounded-t-xl bg-cover bg-center" style="background-image: url(<%s image_url img %>)">
</div>
<div class="flex flex-col p-5 grow">
<h2 class="text-xl font-bold text-title dark:text-dark-title mb-3"><%s title %></h2>
Expand Down
1 change: 1 addition & 0 deletions src/ocamlorg_static/dune
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
%{workspace_root}/asset/css/main.css
%{workspace_root}/asset/planet.xml
%{workspace_root}/asset/changelog.xml
%{workspace_root}/asset/video.xml
(source_tree %{workspace_root}/asset))
(action
(with-stdout-to
Expand Down
5 changes: 0 additions & 5 deletions tool/ood-gen/bin/dune
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,3 @@
(name scrape)
(modules scrape)
(libraries cmdliner ood_gen))

(executable
(name watch_scrape)
(modules watch_scrape)
(libraries yaml ezjsonm ood_gen))
1 change: 1 addition & 0 deletions tool/ood-gen/bin/feed.ml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ let term_templates =
[
("changelog", Changelog.ChangelogFeed.create_feed);
("planet", Planet.GlobalFeed.create_feed);
("video", Video.create_feed);
]

let cmds =
Expand Down
1 change: 1 addition & 0 deletions tool/ood-gen/bin/gen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ let term_templates =
("tool", Tool.template);
("tool_page", Tool_page.template);
("tutorial", Tutorial.template);
("video", Video.template);
("workshops", Workshop.template);
]

Expand Down
4 changes: 3 additions & 1 deletion tool/ood-gen/bin/scrape.ml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
open Cmdliner
open Ood_gen

let term_scrapers = [ ("planet", Ood_gen.Planet.Scraper.scrape) ]
let term_scrapers =
[ ("planet", Planet.Scraper.scrape); ("video", Video.scrape) ]

let cmds =
Cmd.group (Cmd.info "ood-scrape")
Expand Down
1 change: 0 additions & 1 deletion tool/ood-gen/bin/watch_scrape.mli

This file was deleted.

4 changes: 3 additions & 1 deletion tool/ood-gen/lib/dune
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
ocamlorg.global
ocamlorg.data_intf
cmarkit
ezjsonm
yaml
unix
ptime
Expand All @@ -18,6 +19,7 @@
fpath
ppx_stable
hilite
re)
re
xmlm)
(preprocess
(pps ppx_deriving_yaml ppx_stable ppx_deriving.show)))
43 changes: 43 additions & 0 deletions tool/ood-gen/lib/video.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
open Data_intf.Video

(* Named alias for a list of videos, so that YAML converters and the
   [pp_video_list] printer (used by [template]) can be derived for it. *)
type video_list = t list [@@deriving yaml, show]

(** [all ()] is the list of every known video: the YouTube entries followed by
    the watch.ocaml.org entries. *)
let all () = List.append (Youtube.all ()) (Watch.all ())

(** [template ()] renders the source text of an OCaml module that includes
    [Data_intf.Video] and binds [all] to the pretty-printed list of every
    video returned by {!all}. *)
let template () =
  let videos = all () in
  Format.asprintf {ocaml|
include Data_intf.Video
let all =%a
|ocaml}
    pp_video_list videos

(** [create_entry v] builds the Atom entry describing video [v].

    The video URL ([v.content]) is used both as the entry id and as its only
    link. [v.description] becomes the text content, [v.published] is parsed
    as an RFC 3339 date for the [updated] field, and the entry's [source]
    element points at [v.source_link] / [v.source_title]. *)
let create_entry (v : t) =
  let video_uri = Uri.of_string v.content in
  let source_uri = Uri.of_string v.source_link in
  let author =
    Syndic.Atom.author ~uri:(Uri.of_string v.author_uri) v.author_name
  in
  let source : Syndic.Atom.source =
    (* NOTE(review): every optional field is spelled out as [None],
       presumably because [Syndic.Atom.source] takes no positional argument
       that would let them be omitted - confirm before simplifying. *)
    Syndic.Atom.source ~authors:[] ~id:source_uri
      ~title:(Syndic.Atom.Text v.source_title)
      ~links:[ Syndic.Atom.link source_uri ]
      ?updated:None ?categories:None ?contributors:None ?generator:None
      ?icon:None ?logo:None ?rights:None ?subtitle:None
  in
  Syndic.Atom.entry
    ~content:(Syndic.Atom.Text v.description)
    ~source ~id:video_uri
    ~authors:(author, [])
    ~title:(Syndic.Atom.Text v.title)
    ~updated:(Syndic.Date.of_rfc3339 v.published)
    ~links:[ Syndic.Atom.link video_uri ]
    ()

(** [create_feed ()] serialises the feed of all videos to a string.

    The feed is built by [Rss.create_feed] with id ["video.xml"], title
    ["OCaml Videos"] and {!create_entry} as the per-video converter.
    NOTE(review): [~span:3653] is presumably a number of days (about ten
    years) - confirm against [Rss.create_feed]. *)
let create_feed () =
  let open Rss in
  let videos = all () in
  let feed =
    videos
    |> create_feed ~id:"video.xml" ~title:"OCaml Videos" ~create_entry
         ~span:3653
  in
  feed_to_string feed

(** [scrape ()] refreshes both scraped video listings: the YouTube one is
    written to [data/video-youtube.yml] and the watch.ocaml.org one to
    [data/video-watch.yml].

    The paths are relative, so this is expected to run from the repository
    root. They must name the same files that the generators and the dune
    rules read; the watch path previously read ["data/vide-watch.yml"], which
    left the real listing stale. *)
let scrape () =
  Youtube.scrape "data/video-youtube.yml";
  Watch.scrape "data/video-watch.yml"
77 changes: 40 additions & 37 deletions tool/ood-gen/bin/watch_scrape.ml → tool/ood-gen/lib/watch.ml
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
type watch = {
name : string;
embed_path : string;
thumbnail_path : string;
description : string option;
published_at : string;
language : string;
category : string;
}
open Ocamlorg.Import
open Data_intf.Video

type video_list = t list [@@deriving yaml, show]

(** [all ()] loads the watch.ocaml.org videos from ["video-watch.yml"] and
    decodes them as a list of videos.

    A loading or decoding error is annotated with the file name and handed to
    [Result.get_ok] together with [Exn.Decode_error].
    NOTE(review): presumably that raises the exception - confirm in
    [Ocamlorg.Import]. *)
let all () =
  let file = "video-watch.yml" in
  let decode yaml =
    Result.map_error (Utils.where file) (video_list_of_yaml yaml)
  in
  let videos = Result.bind (Utils.yaml_file file) decode in
  Result.get_ok ~error:(fun (`Msg msg) -> Exn.Decode_error msg) videos

(* Extract published_at date, I believe `originallyPublishedAt` applies to
videos imported from other platforms and `publishedAt` to this videos
Expand All @@ -20,39 +24,38 @@ let get_publish_date json =
| `String s -> s
| _ -> failwith "Couldn't calculate the videos original publish date"

(* extract value of language and category *)
let get_language_category json =
let label = Ezjsonm.find json [ "label" ] in
Ezjsonm.get_string label

let get_string_or_none = function `String s -> Some s | _ -> None
let get_string_or_none = function `String s -> s | _ -> ""

let of_json json =
{
name = Ezjsonm.find json [ "name" ] |> Ezjsonm.get_string;
title = Ezjsonm.find json [ "name" ] |> Ezjsonm.get_string;
description = Ezjsonm.find json [ "description" ] |> get_string_or_none;
embed_path = Ezjsonm.find json [ "embedPath" ] |> Ezjsonm.get_string;
thumbnail_path = Ezjsonm.find json [ "thumbnailPath" ] |> Ezjsonm.get_string;
published_at = get_publish_date json;
language = Ezjsonm.find json [ "language" ] |> get_language_category;
category = Ezjsonm.find json [ "category" ] |> get_language_category;
content = Ezjsonm.find json [ "url" ] |> Ezjsonm.get_string;
thumbnail =
"https://watch.ocaml.org"
^ (Ezjsonm.find json [ "thumbnailPath" ] |> Ezjsonm.get_string);
published = get_publish_date json;
author_name = "Unknown";
author_uri = "https://watch.ocaml.org/";
source_link = "https://watch.ocaml.org/";
source_title = "Watch OCaml";
}

let watch_to_yaml t =
`O
([ ("name", `String t.name) ]
@ (match t.description with
| Some s -> [ ("description", `String s) ]
| None -> [])
@ [
("embed_path", `String t.embed_path);
("thumbnail_path", `String t.thumbnail_path);
("published_at", `String t.published_at);
("language", `String t.language);
("category", `String t.category);
])
[
("title", `String t.title);
("description", `String t.description);
("content", `String t.content);
("thumbnail", `String t.thumbnail);
("published", `String t.published);
("author_name", `String t.author_name);
("author_uri", `String t.author_uri);
("source_link", `String t.source_link);
("source_title", `String t.source_title);
]

let to_yaml t = `O [ ("watch", `A (List.map watch_to_yaml t)) ]
let to_yaml t = `A (List.map watch_to_yaml t)
let videos_url = Uri.of_string "https://watch.ocaml.org/api/v1/videos"

(* 100 is current maximum the API can return:
Expand All @@ -70,7 +73,7 @@ let get_videos ?start () =
]
in
let uri = Uri.add_query_params videos_url query_params in
let response = Ood_gen.Http_client.get_sync uri in
let response = Http_client.get_sync uri in
let body = Ezjsonm.value_from_string response in
let data = Ezjsonm.(find body [ "data" ]) in
let total = Ezjsonm.find body [ "total" ] |> Ezjsonm.get_int in
Expand All @@ -86,16 +89,16 @@ let get_all_videos () =
in
aux data

let () =
let scrape yaml_file =
let watch =
get_all_videos ()
|> List.stable_sort (fun w1 w2 -> String.compare w1.name w2.name)
|> List.stable_sort (fun w1 w2 -> String.compare w1.title w2.title)
in
let yaml = to_yaml watch in
let output =
Yaml.pp Format.str_formatter yaml;
Format.flush_str_formatter ()
in
let oc = open_out "data/watch.yml" in
let oc = open_out yaml_file in
Printf.fprintf oc "%s" output;
close_out oc
Loading

0 comments on commit 35bc6ab

Please sign in to comment.