Skip to content

Commit

Permalink
update regex in the kwai item (#390)
Browse files Browse the repository at this point in the history
* update regex in the kwai item

We want to make sure Kwai profile URLs are not parsed by the Kwai item parser.
Until we add a dedicated profile parser for Kwai, they will be handled by the page parser.

Notes:
- I added a regex for when we have a kwai-video host instead of only kwai.
- When the URL contains the username, we use it as a fallback for the author name.
  • Loading branch information
vasconsaurus authored Sep 12, 2023
1 parent 701bdf4 commit 2224ba1
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 7 deletions.
12 changes: 9 additions & 3 deletions app/models/parser/kwai_item.rb
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
module Parser
class KwaiItem < Base
# Matches any http(s) URL whose host is kwai.com or kw.ai, optionally preceded
# by a dot-terminated prefix such as `www.` or `m.`.
# NOTE(review): presumably the pre-change line of this diff; the updated
# `patterns` list no longer includes it — confirm before relying on it.
KWAI_URL = /^https?:\/\/([^.]+\.)?(kwai\.com|kw\.ai)\//
# Matches kwai.com / kw.ai URLs whose first path segment is `@<username>` followed
# by a slash. The word characters after `@` are exposed as the named capture
# `username`. Because the trailing slash is required, a bare `/@name` URL with
# nothing after the username does not match.
KWAI_ITEM_WITH_AUTHOR_URL = /^https?:\/\/([^.]+\.)?(kwai\.com|kw\.ai)\/@(?<username>\d*\w*)\//
# Matches kwai.com / kw.ai URLs whose first path segment consists only of word
# characters and is followed by a slash (e.g. `/photo/...`). `@` is not a word
# character, so `/@name...` paths are not matched by this pattern.
KWAI_ITEM_URL = /^https?:\/\/([^.]+\.)?(kwai\.com|kw\.ai)\/(\d*\w*)\//
# Matches any URL on the kwai-video.com host. The alternation also accepts
# kw.ai with no constraint on the path.
KWAI_VIDEO_URL = /^https?:\/\/([^.]+\.)?(kwai-video\.com|kw\.ai)\//

class << self
# Returns the frozen string identifier of this parser class.
def type
'kwai_item'.freeze
end

# Returns the URL regexes associated with this parser class.
# Presumably consulted by the base parser when deciding whether a URL belongs
# to this parser — confirm against Base.
def patterns
# NOTE(review): this array literal is evaluated and discarded; it looks like the
# pre-change line left over from the diff view — confirm.
[KWAI_URL]
# Last expression, hence the return value: author-item, plain-item and
# kwai-video patterns.
[KWAI_ITEM_WITH_AUTHOR_URL, KWAI_ITEM_URL, KWAI_VIDEO_URL]
end
end

Expand All @@ -20,7 +22,7 @@ def parse_data_for_parser(doc, _original_url, jsonld_array)
jsonld = (jsonld_array.find{|item| item.dig('@type') == 'VideoObject'} || {})

title = get_kwai_text_from_tag(doc, '.info .title')
name = get_kwai_text_from_tag(doc, '.name') || jsonld.dig('creator','name')&.strip
name = get_kwai_text_from_tag(doc, '.name') || jsonld.dig('creator','name')&.strip || match_username(url)
description = get_kwai_text_from_tag(doc, '.info .title') || jsonld.dig('transcript')&.strip || jsonld.dig('description')&.strip
@parsed_data.merge!({
title: title,
Expand All @@ -36,5 +38,9 @@ def parse_data_for_parser(doc, _original_url, jsonld_array)
# Looks up the first node matching a CSS selector and returns its stripped text.
#
# @param doc [#at_css, nil] parsed document to search; nil is tolerated
# @param selector [String] CSS selector handed to `at_css`
# @return [String, nil] whitespace-stripped text of the matched node, or nil
#   when `doc` is nil or no node matches. A node with blank text yields an
#   empty string rather than nil, so `||` fallbacks at call sites will not
#   fire in that case.
def get_kwai_text_from_tag(doc, selector)
  node = doc&.at_css(selector)
  return if node.nil?

  raw_text = node.text
  raw_text&.to_s&.strip
end

# Extracts the author's username from a Kwai item URL of the form
# `https://<host>/@<username>/...`.
#
# @param url [String, nil] URL to inspect; nil is treated as "no match"
# @return [String, nil] the `username` named capture of
#   KWAI_ITEM_WITH_AUTHOR_URL, or nil when the URL does not match
def match_username(url)
  # String#[] with a regexp and a capture name performs the match once and
  # returns nil on no match, replacing the previous double `match` call.
  url.to_s[KWAI_ITEM_WITH_AUTHOR_URL, 'username']
end
end
end
25 changes: 21 additions & 4 deletions test/models/parser/kwai_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,16 @@ def teardown
assert_equal true, match_one.is_a?(Parser::KwaiItem)
match_two = Parser::KwaiItem.match?('https://m.kwai.com/photo/150000228494834/5222636779124848117')
assert_equal true, match_two.is_a?(Parser::KwaiItem)
match_three = Parser::KwaiItem.match?('https://kwai-video.com/p/6UCtAajG')
assert_equal true, match_three.is_a?(Parser::KwaiItem)
match_four = Parser::KwaiItem.match?('https://www.kwai.com/@AnonymouSScobar/video/5217288797260590112?page_source=guest_profile')
assert_equal true, match_four.is_a?(Parser::KwaiItem)
end

# A bare profile URL (nothing after `@username`) must be rejected by the Kwai
# item parser itself. The check has to go through Parser::KwaiItem.match?;
# asking a different parser class would make the assertion pass regardless of
# what the Kwai patterns accept.
test "does not match kwai profile URL" do
  profile_match = Parser::KwaiItem.match?('https://www.kwai.com/@AnonymouSScobar')
  assert_equal false, profile_match.is_a?(Parser::KwaiItem)
end

test "assigns values to hash from the HTML doc" do
doc = response_fixture_from_file('kwai-page.html', parse_as: :html)
Expand All @@ -60,16 +69,24 @@ def teardown

test "assigns values to hash from the json+ld and falls back to url as title" do
doc = Nokogiri::HTML(<<~HTML)
<script data-n-head="ssr" type="application/ld+json" id="VideoObject">{"url":"https://www.kwai.com/fakelink","name":"Fake User. Áudio original criado por Fake User. ","description":"#tag1 #tag2 #tag3","transcript":"video transcript","thumbnailUrl":["http://ak-br-pic.kwai.net/kimg/fake_thumbnail.webp"],"uploadDate":"2022-04-03 19:33:22","contentUrl":"https://cloudflare-br-cdn.kwai.net/upic/2022/04/03/19/fake_video.mp4?tag=1-1694090789-s-0-lm1vo6rkom-4c561f7187b6ac1b","commentCount":3568,"duration":"PT27S","width":612,"height":544,"audio":{"name":"Áudio original criado por Fake User","author":"Fake User","@type":"CreativeWork"},"creator":{"name":"Fake User","image":"https://aws-br-pic.kwai.net/bs2/overseaHead/fake_image.jpg","description":"criador de conteúdo","alternateName":"fakeuser","url":"https://www.kwai.com/@fakeuser","interactionStatistic":[{"userInteractionCount":449001,"interactionType":{"@type":"http://schema.org/LikeAction"},"@type":"InteractionCounter"},{"userInteractionCount":33974,"interactionType":{"@type":"http://schema.org/FollowAction"},"@type":"InteractionCounter"}],"mainEntityOfPage":{"@id":"https://www.kwai.com/@wdklv443","@type":"ProfilePage"},"@type":"Person"},"interactionStatistic":[{"userInteractionCount":163968,"interactionType":{"@type":"http://schema.org/WatchAction"},"@type":"InteractionCounter"},{"userInteractionCount":10489,"interactionType":{"@type":"http://schema.org/LikeAction"},"@type":"InteractionCounter"},{"userInteractionCount":11899,"interactionType":{"@type":"http://schema.org/ShareAction"},"@type":"InteractionCounter"}],"mainEntityOfPage":{"@id":"https://www.kwai.com/fakelink","@type":"ItemPage"},"@context":"https://schema.org/","@type":"VideoObject"}</script>
<script data-n-head="ssr" type="application/ld+json" id="VideoObject">{"url":"https://www.kwai.com/@fakeuser/111111111","name":"Fake User. Áudio original criado por Fake User. ","description":"#tag1 #tag2 #tag3","transcript":"video transcript","thumbnailUrl":["http://ak-br-pic.kwai.net/kimg/fake_thumbnail.webp"],"uploadDate":"2022-04-03 19:33:22","contentUrl":"https://cloudflare-br-cdn.kwai.net/upic/2022/04/03/19/fake_video.mp4?tag=1-1694090789-s-0-lm1vo6rkom-4c561f7187b6ac1b","commentCount":3568,"duration":"PT27S","width":612,"height":544,"audio":{"name":"Áudio original criado por Fake User","author":"Fake User","@type":"CreativeWork"},"creator":{"name":"Fake User","image":"https://aws-br-pic.kwai.net/bs2/overseaHead/fake_image.jpg","description":"criador de conteúdo","alternateName":"fakeuser","url":"https://www.kwai.com/@fakeuser","interactionStatistic":[{"userInteractionCount":449001,"interactionType":{"@type":"http://schema.org/LikeAction"},"@type":"InteractionCounter"},{"userInteractionCount":33974,"interactionType":{"@type":"http://schema.org/FollowAction"},"@type":"InteractionCounter"}],"mainEntityOfPage":{"@id":"https://www.kwai.com/@wdklv443","@type":"ProfilePage"},"@type":"Person"},"interactionStatistic":[{"userInteractionCount":163968,"interactionType":{"@type":"http://schema.org/WatchAction"},"@type":"InteractionCounter"},{"userInteractionCount":10489,"interactionType":{"@type":"http://schema.org/LikeAction"},"@type":"InteractionCounter"},{"userInteractionCount":11899,"interactionType":{"@type":"http://schema.org/ShareAction"},"@type":"InteractionCounter"}],"mainEntityOfPage":{"@id":"https://www.kwai.com/@fakeuser/111111111","@type":"ItemPage"},"@context":"https://schema.org/","@type":"VideoObject"}</script>
HTML

WebMock.stub_request(:any, 'https://www.kwai.com/fakelink').to_return(status: 200, body: doc.to_s)
WebMock.stub_request(:any, 'https://www.kwai.com/@fakeuser/111111111').to_return(status: 200, body: doc.to_s)

media = Media.new(url: 'https://www.kwai.com/fakelink')
media = Media.new(url: 'https://www.kwai.com/@fakeuser/111111111')
data = media.as_json

assert_equal 'video transcript', data['description']
assert_equal 'https://www.kwai.com/fakelink', data['title']
assert_equal 'https://www.kwai.com/@fakeuser/111111111', data['title']
assert_equal 'Fake User', data['author_name']
end

# With an empty HTML document there is no `.name` node and no JSON-LD, so the
# author name is expected to come from the `@username` segment of the URL.
test "fallbacks to the username on the url when doc and json+ld are not present, if name is present in the url" do
  item_url = 'https://www.kwai.com/@fakeuser/111111111'
  blank_doc = Nokogiri::HTML('')

  parser = Parser::KwaiItem.new(item_url)
  parsed = parser.parse_data(blank_doc, item_url)

  assert_equal 'fakeuser', parsed['author_name']
end
end

0 comments on commit 2224ba1

Please sign in to comment.