Skip to content

Commit

Permalink
Merge pull request #593 from internetarchive/adam/add-crawl-log-loggi…
Browse files Browse the repository at this point in the history
…ng-to-extractoryoutubedl

feat: Add logging to crawl.log for metadata records created by ExtractorYoutubeDL
  • Loading branch information
adam-miller authored Aug 7, 2024
2 parents 3ae1300 + 59635f0 commit b22d6ce
Show file tree
Hide file tree
Showing 7 changed files with 221 additions and 3 deletions.
5 changes: 5 additions & 0 deletions contrib/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@
<artifactId>Java-WebSocket</artifactId>
<version>1.5.2</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.16.1</version>
</dependency>
</dependencies>
<repositories>
<repository>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand All @@ -43,7 +44,9 @@
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.frontier.AMQPUrlReceiver;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.format.warc.WARCConstants.WARCRecordType;
Expand All @@ -52,8 +55,6 @@
import org.archive.modules.CrawlURI;
import org.archive.modules.warc.BaseWARCRecordBuilder;
import org.archive.modules.warc.WARCRecordBuilder;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.MimetypeUtils;
import org.springframework.beans.factory.annotation.Autowired;
Expand Down Expand Up @@ -109,6 +110,7 @@ public class ExtractorYoutubeDL extends Extractor
// NOTE(review): the three "containing page" keys are not used in the visible part of this
// file; presumably they carry the digest, timestamp and URI of the page on which the
// video was found -- confirm against their callers.
protected static final String YDL_CONTAINING_PAGE_DIGEST = "ydl-containing-page-digest";
protected static final String YDL_CONTAINING_PAGE_TIMESTAMP = "ydl-containing-page-timestamp";
protected static final String YDL_CONTAINING_PAGE_URI = "ydl-containing-page-uri";
// CrawlURI data-map key holding the raw SHA-1 bytes of the youtube-dl json temp file.
// buildRecord() stores the digest under this key and postWrite() reads it back to set
// the content digest of the pseudo CrawlURI it logs.
protected static final String YDL_JSON_FILE_DIGEST = "ydl-json-file-digest";

// NOTE(review): presumably an upper bound on videos handled per page; its use is not
// visible in this part of the file -- confirm.
protected static final int MAX_VIDEOS_PER_PAGE = 1000;

Expand Down Expand Up @@ -173,6 +175,25 @@ public void setCrawlerLoggerModule(CrawlerLoggerModule crawlerLoggerModule) {
this.crawlerLoggerModule = crawlerLoggerModule;
}

/**
 * Controller whose logger module postWrite() uses to reach the URI-processing logger.
 * Annotated for autowiring; it can also be supplied explicitly through
 * {@link #setCrawlController(CrawlController)}.
 */
@Autowired
protected CrawlController controller;
/**
 * Sets the controller used when logging written metadata records.
 *
 * @param controller the crawl controller to use
 */
public void setCrawlController(CrawlController controller) {
this.controller = controller;
}

// Instance initializer: logging of written metadata records is switched on by default.
{
setLogMetadataRecord(true);
}
/**
 * Reports whether a crawl.log entry is created for WARC metadata records that are written.
 *
 * @return the boolean stored under the "logMetadataRecord" key
 */
public boolean getLogMetadataRecord() {
    Boolean enabled = (Boolean) kp.get("logMetadataRecord");
    return enabled;
}
/**
 * Switches crawl.log entries for written WARC metadata records on or off.
 *
 * @param logMetadataRecord <code>true</code> to create a crawl.log entry for each
 *        WARC metadata record written, <code>false</code> to write none
 */
public void setLogMetadataRecord(boolean logMetadataRecord) {
    kp.put("logMetadataRecord", Boolean.valueOf(logMetadataRecord));
}

@Override
public void start() {
if (!isRunning) {
Expand Down Expand Up @@ -372,6 +393,12 @@ public int read(byte b[], int off, int len) throws IOException {
}
}

/**
 * Dummy output stream that swallows bytes without storing anything.
 *
 * <p>Both the single-byte and the bulk write are overridden so that a bulk write is
 * discarded in one step instead of going through the inherited byte-by-byte loop.
 */
public class NullOutputStream extends OutputStream {
    /** Discards the given byte. */
    @Override
    public void write(int b) throws IOException {
        // intentionally empty: nothing is stored
    }

    /**
     * Discards <code>len</code> bytes of <code>b</code> starting at <code>off</code>.
     *
     * @throws NullPointerException if <code>b</code> is null
     * @throws IndexOutOfBoundsException if <code>off</code>/<code>len</code> do not
     *         describe a range inside <code>b</code>
     */
    @Override
    public void write(byte[] b, int off, int len) throws IOException {
        // Keep the argument checks of OutputStream.write(byte[], int, int) so that
        // invalid calls still fail the same way; only the per-byte loop is dropped.
        if (b == null) {
            throw new NullPointerException();
        }
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        }
    }
}

/**
* Streams through youtube-dl json output. Sticks video urls in
* <code>results.videoUrls</code>, web page urls in
Expand Down Expand Up @@ -583,6 +610,10 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)

getLocalTempFile().seek(0);
InputStream inputStream = Channels.newInputStream(getLocalTempFile().getChannel());
curi.getData().put(YDL_JSON_FILE_DIGEST, DigestUtils.sha1(inputStream));
//Leave InputStream open for warc writer to handle

getLocalTempFile().seek(0);
recordInfo.setContentStream(inputStream);
recordInfo.setContentLength(getLocalTempFile().length());

Expand All @@ -591,6 +622,43 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
return recordInfo;
}

/**
 * Creates a crawl.log entry for the youtube-dl WARC metadata record that was just written.
 *
 * <p>Because we are writing an additional WARC Metadata Record for the json video info,
 * there is no CrawlURI for that record, and thus nothing ever goes through the frontier to
 * be logged to the crawl.log. To log this capture we create a CrawlURI
 * <code>pseudoCuri</code> from <code>curi</code>, assign the appropriate values taken from
 * <code>recordInfo</code>, and then call the URI-processing logger. Nothing is logged when
 * {@link #getLogMetadataRecord()} returns false.
 *
 * @param recordInfo WARCRecordInfo object that was just written
 * @param curi CrawlURI that generated the WARCRecordInfo object
 */
@Override
public void postWrite(WARCRecordInfo recordInfo, CrawlURI curi) {
    if (!this.getLogMetadataRecord()) {
        return;
    }

    try {
        CrawlURI pseudoCuri = curi.createCrawlURI(recordInfo.getUrl(), LinkContext.EMBED_MISC, Hop.INFERRED);

        pseudoCuri.getAnnotations().add("youtube-dl:");
        pseudoCuri.setThreadNumber(curi.getThreadNumber());
        pseudoCuri.setContentSize(recordInfo.getContentLength());
        pseudoCuri.setContentType(recordInfo.getMimetype());
        pseudoCuri.addExtraInfo("warcFilename", recordInfo.getWARCFilename());
        pseudoCuri.addExtraInfo("warcFileOffset", recordInfo.getWARCFileOffset());
        // NOTE(review): the fetch status is hard-coded; confirm 204 is the intended
        // crawl.log status for these metadata-only entries.
        pseudoCuri.setFetchStatus(204);
        // The digest bytes were stored on the originating CrawlURI by buildRecord().
        pseudoCuri.setContentDigest("sha1", (byte[]) curi.getData().get(YDL_JSON_FILE_DIGEST));
        pseudoCuri.addExtraInfo("contentSize", recordInfo.getContentLength());

        Object[] array = {pseudoCuri};
        this.controller.getLoggerModule().getUriProcessing().log(Level.INFO,
                curi.getUURI().toString(), array);
    } catch (URIException e) {
        logger.log(Level.WARNING, "Exception while parsing UURI for youtube-dl metadata record " + recordInfo.getUrl(), e);
    } catch (IOException e) {
        // No digest is computed in this method, so report a generic I/O failure
        // rather than a digest failure.
        logger.log(Level.WARNING, "I/O error while logging youtube-dl metadata record " + recordInfo.getUrl(), e);
    }
}

public static void main(String[] args) throws IOException {
/*
File t = File.createTempFile("ydl", ".json");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
package org.archive.modules.extractor;

import org.apache.commons.io.IOUtils;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.modules.CrawlURI;

import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.Channels;
import java.util.ArrayList;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;

/**
 * Tests for the WARC metadata record and the crawl.log line that
 * {@link ExtractorYoutubeDL} produces from a stored yt-dlp json result.
 */
public class ExtractorYoutubeDLTest extends ContentExtractorTestBase {

    protected String getTestUri() {
        return "https://www.youtube.com/watch?v=i08NNO-DPgg";
    }

    protected String getTestResourceFileName() {
        return "ExtractorYoutubeDL.json";
    }

    protected String getTestResourceSha1() {
        return "WFD7RIFCGNFVAWBEWLF6T2HXPXDEZY45";
    }

    /** Reads the json fixture from the test classpath and closes the stream afterwards. */
    private byte[] readTestResource() throws Exception {
        try (InputStream is = this.getClass().getClassLoader().getResourceAsStream(getTestResourceFileName())) {
            assertNotNull("missing test resource " + getTestResourceFileName(), is);
            return IOUtils.toByteArray(is);
        }
    }

    /** Copies <code>content</code> into the extractor's local temp file. */
    private void writeToLocalTempFile(ExtractorYoutubeDL ex, byte[] content) throws Exception {
        // Deliberately not closed: closing this stream would close the temp file's
        // channel, which buildRecord() still has to read from.
        OutputStream os = Channels.newOutputStream(ex.getLocalTempFile().getChannel());
        IOUtils.write(content, os);
    }

    /**
     * Test that we have the expected WARC Metadata Record given a json output from yt-dlp
     * @throws Exception
     */
    public void testBuildRecord() throws Exception {
        CrawlURI testUri = CrawlURI.fromHopsViaString(getTestUri());
        byte[] jsonResults = readTestResource();
        ExtractorYoutubeDL ex = (ExtractorYoutubeDL) extractor;
        writeToLocalTempFile(ex, jsonResults);

        WARCRecordInfo record = ex.buildRecord(testUri, null);

        assertEquals("youtube-dl:" + getTestUri(), record.getUrl());
        assertEquals(WARCConstants.WARCRecordType.metadata, record.getType());
        assertEquals("application/vnd.youtube-dl_formats+json;charset=utf-8", record.getMimetype());

        // Test input file is the same content as the content to be written to warc
        byte[] outputArray = IOUtils.toByteArray(record.getContentStream());
        org.junit.Assert.assertArrayEquals(jsonResults, outputArray);
    }

    /**
     * Test that the resulting log line is as expected, and the resulting hash string matches
     * @throws Exception
     */
    public void testPostWrite() throws Exception {
        CrawlURI testUri = CrawlURI.fromHopsViaString(getTestUri());
        byte[] jsonResults = readTestResource();
        ExtractorYoutubeDL ex = (ExtractorYoutubeDL) extractor;
        writeToLocalTempFile(ex, jsonResults);

        WARCRecordInfo record = ex.buildRecord(testUri, null);

        ex.controller.setLoggerModule(new CrawlerLoggerModule() {
            @Override
            public Logger getUriProcessing() {
                Logger logger = Logger.getLogger(ExtractorYoutubeDL.class.getName());

                logger.setLevel(Level.ALL);
                return logger;
            }
        });
        Logger logger = ex.controller.getLoggerModule().getUriProcessing();
        TestLogHandler logHandler = new TestLogHandler();
        logger.addHandler(logHandler);
        try {
            UriProcessingFormatter formatter = new UriProcessingFormatter(true);

            // With logging switched off, postWrite must not emit anything.
            ex.setLogMetadataRecord(false);
            ex.postWrite(record, testUri);
            assertEquals(0, logHandler.getLines().length);

            ex.setLogMetadataRecord(true);
            ex.postWrite(record, testUri);
            LogRecord[] logLines = logHandler.getLines();
            assertTrue("expected at least one crawl.log record", logLines.length > 0);

            String message = formatter.format(logLines[0]);
            String expectedCrawlLogLine = " 204 434699 youtube-dl:https://www.youtube.com/watch?v=i08NNO-DPgg I https://www.youtube.com/watch?v=i08NNO-DPgg application/vnd.youtube-dl_formats+json #000 - sha1:WFD7RIFCGNFVAWBEWLF6T2HXPXDEZY45 - youtube-dl: {\"contentSize\":434699}";
            assertTrue("unexpected crawl.log line: " + message, message.contains(expectedCrawlLogLine));
        } finally {
            // The logger is looked up by name and therefore shared; do not leave the
            // test handler attached to it for later tests.
            logger.removeHandler(logHandler);
        }
    }

    @Override
    protected Extractor makeExtractor() {
        CrawlController controller = new CrawlController();
        ExtractorYoutubeDL ex = new ExtractorYoutubeDL();
        ex.setCrawlController(controller);

        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule();
        ex.setLoggerModule(ulm);

        return ex;
    }
}

/**
 * Log handler for tests: keeps every published LogRecord in memory so that the
 * individual records can be inspected afterwards.
 */
class TestLogHandler extends Handler
{
    ArrayList<LogRecord> logLines = new ArrayList<LogRecord>();

    public TestLogHandler() {
    }

    /** Returns the records published so far, in the order they were published. */
    public LogRecord[] getLines() {
        LogRecord[] snapshot = new LogRecord[this.logLines.size()];
        return this.logLines.toArray(snapshot);
    }

    /** Remembers the record; nothing is formatted or written anywhere. */
    @Override
    public void publish(LogRecord record) {
        this.logLines.add(record);
    }

    /** Nothing is buffered, so there is nothing to flush. */
    @Override
    public void flush() {
    }

    /** Holds no resources, so there is nothing to close. */
    @Override
    public void close() {
    }
}
1 change: 1 addition & 0 deletions contrib/src/test/resources/ExtractorYoutubeDL.json

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package org.archive.modules.warc;

import org.archive.io.warc.WARCRecordInfo;
import org.archive.modules.CrawlURI;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.UUID;
Expand All @@ -12,4 +15,7 @@ public static URI generateRecordID() {
throw new RuntimeException(e); // impossible
}
}
/**
 * Hook called with a record after it has been written. This base implementation does
 * nothing; subclasses may override it to act on the written record.
 *
 * @param recordInfo the record that was written
 * @param curi the CrawlURI the record was built for
 */
public void postWrite(WARCRecordInfo recordInfo, CrawlURI curi) {
    // no-op by default
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,6 @@ public interface WARCRecordBuilder {
WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
throws IOException;

}
/**
 * Callback for a record after it has been written, e.g. to log the capture.
 *
 * <p>Declared as a default method that does nothing, so that existing implementations
 * of this interface keep compiling without having to add the method.
 *
 * @param warcRecordInfo the record that was written
 * @param curi the CrawlURI the record was built for
 */
default void postWrite(WARCRecordInfo warcRecordInfo, CrawlURI curi) {
    // no-op by default
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,9 @@ protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException
if (recordBuilder.shouldBuildRecord(curi)) {
WARCRecordInfo record = recordBuilder.buildRecord(curi, concurrentTo);
if (record != null) {
record.setWARCFileOffset(writer.getPosition());
writer.writeRecord(record);
record.setWARCFilename(writer.getFilenameWithoutOccupiedSuffix());
InputStream is = null;
try {
is = record.getContentStream();
Expand All @@ -178,6 +180,7 @@ protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException
if (concurrentTo == null) {
concurrentTo = record.getRecordId();
}
recordBuilder.postWrite(record, curi);
}
}
}
Expand Down

0 comments on commit b22d6ce

Please sign in to comment.