diff --git a/app/models/parser/kwai_item.rb b/app/models/parser/kwai_item.rb index 590992a9..78eb1c32 100644 --- a/app/models/parser/kwai_item.rb +++ b/app/models/parser/kwai_item.rb @@ -1,6 +1,8 @@ module Parser class KwaiItem < Base - KWAI_URL = /^https?:\/\/([^.]+\.)?(kwai\.com|kw\.ai)\// + KWAI_ITEM_WITH_AUTHOR_URL = /^https?:\/\/([^.]+\.)?(kwai\.com|kw\.ai)\/@(?<username>\d*\w*)\// + KWAI_ITEM_URL = /^https?:\/\/([^.]+\.)?(kwai\.com|kw\.ai)\/(\d*\w*)\// + KWAI_VIDEO_URL = /^https?:\/\/([^.]+\.)?(kwai-video\.com|kw\.ai)\// class << self def type @@ -8,7 +10,7 @@ def type end def patterns - [KWAI_URL] + [KWAI_ITEM_WITH_AUTHOR_URL, KWAI_ITEM_URL, KWAI_VIDEO_URL] end end @@ -20,7 +22,7 @@ def parse_data_for_parser(doc, _original_url, jsonld_array) jsonld = (jsonld_array.find{|item| item.dig('@type') == 'VideoObject'} || {}) title = get_kwai_text_from_tag(doc, '.info .title') - name = get_kwai_text_from_tag(doc, '.name') || jsonld.dig('creator','name')&.strip + name = get_kwai_text_from_tag(doc, '.name') || jsonld.dig('creator','name')&.strip || match_username(url) description = get_kwai_text_from_tag(doc, '.info .title') || jsonld.dig('transcript')&.strip || jsonld.dig('description')&.strip @parsed_data.merge!({ title: title, @@ -36,5 +38,9 @@ def parse_data_for_parser(doc, _original_url, jsonld_array) def get_kwai_text_from_tag(doc, selector) doc&.at_css(selector)&.text&.to_s&.strip end + + def match_username(url) + if url.match(KWAI_ITEM_WITH_AUTHOR_URL) then url.match(KWAI_ITEM_WITH_AUTHOR_URL)['username'] end + end end end diff --git a/test/models/parser/kwai_test.rb b/test/models/parser/kwai_test.rb index fad3139e..4a6cc047 100644 --- a/test/models/parser/kwai_test.rb +++ b/test/models/parser/kwai_test.rb @@ -35,7 +35,16 @@ def teardown assert_equal true, match_one.is_a?(Parser::KwaiItem) match_two = Parser::KwaiItem.match?('https://m.kwai.com/photo/150000228494834/5222636779124848117') assert_equal true, match_two.is_a?(Parser::KwaiItem) + match_three = 
Parser::KwaiItem.match?('https://kwai-video.com/p/6UCtAajG') + assert_equal true, match_three.is_a?(Parser::KwaiItem) + match_four = Parser::KwaiItem.match?('https://www.kwai.com/@AnonymouSScobar/video/5217288797260590112?page_source=guest_profile') + assert_equal true, match_four.is_a?(Parser::KwaiItem) end + + test "does not match kwai profile URL" do + match_five = Parser::KwaiItem.match?('https://www.kwai.com/@AnonymouSScobar') + assert_equal false, match_five.is_a?(Parser::KwaiItem) + end test "assigns values to hash from the HTML doc" do doc = response_fixture_from_file('kwai-page.html', parse_as: :html) @@ -60,16 +69,24 @@ def teardown test "assigns values to hash from the json+ld and falls back to url as title" do doc = Nokogiri::HTML(<<~HTML) - + HTML - WebMock.stub_request(:any, 'https://www.kwai.com/fakelink').to_return(status: 200, body: doc.to_s) + WebMock.stub_request(:any, 'https://www.kwai.com/@fakeuser/111111111').to_return(status: 200, body: doc.to_s) - media = Media.new(url: 'https://www.kwai.com/fakelink') + media = Media.new(url: 'https://www.kwai.com/@fakeuser/111111111') data = media.as_json assert_equal 'video transcript', data['description'] - assert_equal 'https://www.kwai.com/fakelink', data['title'] + assert_equal 'https://www.kwai.com/@fakeuser/111111111', data['title'] assert_equal 'Fake User', data['author_name'] end + + test "falls back to the username on the url when doc and json+ld are not present, if name is present in the url" do + empty_doc = Nokogiri::HTML('') + + data = Parser::KwaiItem.new('https://www.kwai.com/@fakeuser/111111111').parse_data(empty_doc, 'https://www.kwai.com/@fakeuser/111111111') + + assert_equal 'fakeuser', data['author_name'] + end end