Merge pull request #593 from internetarchive/adam/add-crawl-log-logging-to-extractoryoutubedl

feat: Add logging to crawl.log for metadata records created by ExtractorYoutubeDL
internetarchive · Aug 7, 2024 · b22d6ce · b22d6ce
2 parents 3ae1300 + 59635f0
commit b22d6ce
Show file tree

Hide file tree

Showing 7 changed files with 221 additions and 3 deletions.
diff --git a/contrib/pom.xml b/contrib/pom.xml
@@ -88,6 +88,11 @@
 			<artifactId>Java-WebSocket</artifactId>
 			<version>1.5.2</version>
 		</dependency>
+		<dependency>
+			<groupId>commons-codec</groupId>
+			<artifactId>commons-codec</artifactId>
+			<version>1.16.1</version>
+		</dependency>
 	</dependencies>
 	<repositories>
 		<repository>

diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java
@@ -27,6 +27,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.OutputStream;
 import java.io.RandomAccessFile;
 import java.io.Reader;
 import java.io.UnsupportedEncodingException;
@@ -43,7 +44,9 @@
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.httpclient.URIException;
+import org.archive.crawler.framework.CrawlController;
 import org.archive.crawler.frontier.AMQPUrlReceiver;
 import org.archive.crawler.reporting.CrawlerLoggerModule;
 import org.archive.format.warc.WARCConstants.WARCRecordType;
@@ -52,8 +55,6 @@
 import org.archive.modules.CrawlURI;
 import org.archive.modules.warc.BaseWARCRecordBuilder;
 import org.archive.modules.warc.WARCRecordBuilder;
-import org.archive.net.UURI;
-import org.archive.net.UURIFactory;
 import org.archive.util.ArchiveUtils;
 import org.archive.util.MimetypeUtils;
 import org.springframework.beans.factory.annotation.Autowired;
@@ -109,6 +110,7 @@ public class ExtractorYoutubeDL extends Extractor
     protected static final String YDL_CONTAINING_PAGE_DIGEST = "ydl-containing-page-digest";
     protected static final String YDL_CONTAINING_PAGE_TIMESTAMP = "ydl-containing-page-timestamp";
     protected static final String YDL_CONTAINING_PAGE_URI = "ydl-containing-page-uri";
+    protected static final String YDL_JSON_FILE_DIGEST = "ydl-json-file-digest";
 
     protected static final int MAX_VIDEOS_PER_PAGE = 1000;
 
@@ -173,6 +175,25 @@ public void setCrawlerLoggerModule(CrawlerLoggerModule crawlerLoggerModule) {
         this.crawlerLoggerModule = crawlerLoggerModule;
     }
 
+    @Autowired
+    protected CrawlController controller; // postWrite() reaches the uri-processing logger through this controller's logger module
+    public void setCrawlController(CrawlController controller) { // plain setter; the unit test calls it directly instead of relying on autowiring
+        this.controller = controller;
+    }
+
+    {
+        setLogMetadataRecord(true); // instance initializer: the flag is set to true for every new instance
+    }
+    public boolean getLogMetadataRecord() { // reads the flag back from kp under the key "logMetadataRecord"
+        return (Boolean) kp.get("logMetadataRecord");
+    }
+    /**
+     * Whether or not to create a crawl.log entry for any WARC Metadata Records written.
+     * The value is stored in <code>kp</code> under the key "logMetadataRecord";
+     * postWrite() checks it first and returns without logging when it is false.
+     */
+    public void setLogMetadataRecord(boolean logMetadataRecord) {
+        kp.put("logMetadataRecord",logMetadataRecord);
+    }
+
     @Override
     public void start() {
         if (!isRunning) {
@@ -372,6 +393,12 @@ public int read(byte b[], int off, int len) throws IOException {
         }
     }
 
+    /**
+     * Dummy output stream to swallow bytes without storing anything.
+     * NOTE(review): declared as a non-static inner class although the body shown
+     * here uses no state of the enclosing extractor; consider making it static.
+     */
+    public class NullOutputStream extends OutputStream {
+        @Override
+        public void write(int b) throws IOException {} // intentionally empty: the byte is discarded
+    }
+
     /**
      * Streams through youtube-dl json output. Sticks video urls in
      * <code>results.videoUrls</code>, web page urls in
@@ -583,6 +610,10 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
 
         getLocalTempFile().seek(0);
         InputStream inputStream = Channels.newInputStream(getLocalTempFile().getChannel());
+        curi.getData().put(YDL_JSON_FILE_DIGEST, DigestUtils.sha1(inputStream));
+        //Leave InputStream open for warc writer to handle
+
+        getLocalTempFile().seek(0);
         recordInfo.setContentStream(inputStream);
         recordInfo.setContentLength(getLocalTempFile().length());
 
@@ -591,6 +622,43 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
         return recordInfo;
     }
 
+    /**
+     * Logs the youtube-dl WARC Metadata Record to the crawl.log after it has been written.
+     *
+     * Because we are writing an additional WARC Metadata Record for the json video info, there is no CrawlURI for that
+     * record, and thus nothing ever goes through the frontier to be logged to the crawl.log. To log this capture we
+     * create a CrawlURI <code>pseudoCuri</code> object from the originating CrawlURI, assign the appropriate values
+     * and then call the uri-processing logger. Does nothing when {@link #getLogMetadataRecord()} is false.
+     * Failures while building or logging the pseudo CrawlURI are logged as warnings and not rethrown.
+     *
+     * @param recordInfo WARCRecordInfo object that was just written
+     * @param curi CrawlURI that generated the WARCRecordInfo Object
+     */
+    @Override
+    public void postWrite(WARCRecordInfo recordInfo, CrawlURI curi) {
+        if (!getLogMetadataRecord()) {
+            return;
+        }
+
+        try {
+            CrawlURI pseudoCuri = curi.createCrawlURI(recordInfo.getUrl(), LinkContext.EMBED_MISC, Hop.INFERRED);
+
+            pseudoCuri.getAnnotations().add("youtube-dl:");
+            pseudoCuri.setThreadNumber(curi.getThreadNumber());
+            pseudoCuri.setContentSize(recordInfo.getContentLength());
+            pseudoCuri.setContentType(recordInfo.getMimetype());
+            pseudoCuri.addExtraInfo("warcFilename", recordInfo.getWARCFilename());
+            pseudoCuri.addExtraInfo("warcFileOffset", recordInfo.getWARCFileOffset());
+            pseudoCuri.setFetchStatus(204);
+            // The digest is not computed here: buildRecord() stores it in the CrawlURI data map.
+            pseudoCuri.setContentDigest("sha1", (byte[]) curi.getData().get(YDL_JSON_FILE_DIGEST));
+            pseudoCuri.addExtraInfo("contentSize", recordInfo.getContentLength());
+
+            // The originating URI is the log message; the pseudo CrawlURI is the single log parameter.
+            controller.getLoggerModule().getUriProcessing().log(Level.INFO,
+                    curi.getUURI().toString(), new Object[] { pseudoCuri });
+        } catch (URIException e) {
+            logger.log(Level.WARNING, "Exception while parsing UURI for youtube-dl metadata record " + recordInfo.getUrl(), e);
+        } catch (IOException e) {
+            // No digest is generated in this method, so report the failure as a logging failure.
+            logger.log(Level.WARNING, "Exception while logging youtube-dl metadata record " + recordInfo.getUrl(), e);
+        }
+    }
+
     public static void main(String[] args) throws IOException {
         /*
         File t = File.createTempFile("ydl", ".json");

diff --git a/contrib/src/test/java/org/archive/modules/extractor/ExtractorYoutubeDLTest.java b/contrib/src/test/java/org/archive/modules/extractor/ExtractorYoutubeDLTest.java
@@ -0,0 +1,133 @@
+package org.archive.modules.extractor;
+
+import org.apache.commons.io.IOUtils;
+import org.archive.crawler.framework.CrawlController;
+import org.archive.crawler.io.UriProcessingFormatter;
+import org.archive.crawler.reporting.CrawlerLoggerModule;
+import org.archive.format.warc.WARCConstants;
+import org.archive.io.warc.WARCRecordInfo;
+import org.archive.modules.CrawlURI;
+
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.channels.Channels;
+import java.util.ArrayList;
+import java.util.logging.Handler;
+import java.util.logging.Level;
+import java.util.logging.LogRecord;
+import java.util.logging.Logger;
+
+public class ExtractorYoutubeDLTest extends ContentExtractorTestBase {
+
+    /** URI of the page the canned yt-dlp output belongs to. */
+    protected String getTestUri() {
+        return "https://www.youtube.com/watch?v=i08NNO-DPgg";
+    }
+    /** Classpath name of the canned yt-dlp json output. */
+    protected String getTestResourceFileName() {
+        return "ExtractorYoutubeDL.json";
+    }
+    /** Digest string expected in the crawl.log line for the canned json. */
+    protected String getTestResourceSha1() { return "WFD7RIFCGNFVAWBEWLF6T2HXPXDEZY45"; }
+
+    /**
+     * Reads the canned json from the classpath and writes it into the extractor's local temp file.
+     * The channel-backed output stream is deliberately not closed, because closing it would also
+     * close the temp file's channel that buildRecord() reads from.
+     *
+     * @param ex extractor whose temp file receives the json
+     * @return the json bytes that were written
+     * @throws Exception if the resource cannot be read or the temp file cannot be written
+     */
+    private byte[] writeTestJsonToTempFile(ExtractorYoutubeDL ex) throws Exception {
+        byte[] json;
+        try (InputStream is = this.getClass().getClassLoader().getResourceAsStream(getTestResourceFileName())) {
+            json = IOUtils.toByteArray(is);
+        }
+        OutputStream os = Channels.newOutputStream(ex.getLocalTempFile().getChannel());
+        IOUtils.write(json, os);
+        return json;
+    }
+
+    /**
+     * Test that we have the expected WARC Metadata Record given a json output from yt-dlp
+     * @throws Exception
+     */
+    public void testBuildRecord() throws Exception {
+        CrawlURI testUri = CrawlURI.fromHopsViaString(getTestUri());
+        ExtractorYoutubeDL ex = (ExtractorYoutubeDL)extractor;
+        byte[] jsonResults = writeTestJsonToTempFile(ex);
+        WARCRecordInfo record = ex.buildRecord(testUri, null);
+
+        // JUnit convention is (expected, actual); the reverse order yields misleading failure messages.
+        assertEquals("youtube-dl:" + getTestUri(), record.getUrl());
+        assertEquals(WARCConstants.WARCRecordType.metadata, record.getType());
+        assertEquals("application/vnd.youtube-dl_formats+json;charset=utf-8", record.getMimetype());
+
+        //Test input file is the same content as the content to be written to warc
+        byte[] outputArray = IOUtils.toByteArray(record.getContentStream());
+        org.junit.Assert.assertArrayEquals(jsonResults, outputArray);
+    }
+
+    /**
+     * Test that the resulting log line is as expected, and the resulting hash string matches
+     * @throws Exception
+     */
+    public void testPostWrite() throws Exception {
+        CrawlURI testUri = CrawlURI.fromHopsViaString(getTestUri());
+        ExtractorYoutubeDL ex = (ExtractorYoutubeDL)extractor;
+        writeTestJsonToTempFile(ex);
+
+        WARCRecordInfo record = ex.buildRecord(testUri, null);
+
+        ex.controller.setLoggerModule(new CrawlerLoggerModule() {
+            @Override
+            public Logger getUriProcessing() {
+                Logger logger = Logger.getLogger(ExtractorYoutubeDL.class.getName());
+
+                logger.setLevel(Level.ALL);
+                return logger;
+            }
+        });
+        Logger logger = ex.controller.getLoggerModule().getUriProcessing();
+        TestLogHandler logHandler = new TestLogHandler();
+        logger.addHandler(logHandler);
+        try {
+            UriProcessingFormatter formatter = new UriProcessingFormatter(true);
+
+            // JUnit assertions are used instead of the assert keyword, which is skipped unless the JVM runs with -ea.
+            ex.setLogMetadataRecord(false);
+            ex.postWrite(record, testUri);
+            assertEquals(0, logHandler.getLines().length);
+
+            ex.setLogMetadataRecord(true);
+            ex.postWrite(record, testUri);
+            LogRecord[] logLines = logHandler.getLines();
+            assertTrue(logLines.length > 0);
+            String message = formatter.format(logLines[0]);
+            String expectedCrawlLogLine = "   204     434699 youtube-dl:" + getTestUri() + " I " + getTestUri()
+                    + " application/vnd.youtube-dl_formats+json #000 - sha1:" + getTestResourceSha1()
+                    + " - youtube-dl: {\"contentSize\":434699}";
+            assertTrue("unexpected crawl.log line: " + message, message.contains(expectedCrawlLogLine));
+        } finally {
+            // The logger is looked up by name and shared, so detach the handler to avoid leaking it into other tests.
+            logger.removeHandler(logHandler);
+        }
+    }
+
+    @Override
+    protected Extractor makeExtractor() {
+        CrawlController controller = new CrawlController();
+        ExtractorYoutubeDL ex = new ExtractorYoutubeDL();
+        ex.setCrawlController(controller);
+
+        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule();
+        ex.setLoggerModule(ulm);
+
+        return ex;
+    }
+}
+
+/**
+ * Helper class to let us inspect the individual LogRecords.
+ * Every record passed to publish() is appended to an in-memory list, and
+ * getLines() returns the list's contents as an array.
+ */
+class TestLogHandler extends Handler
+{
+    ArrayList<LogRecord> logLines; // records in the order publish() received them
+    public TestLogHandler() {
+        super();
+        this.logLines = new ArrayList<LogRecord>();
+    }
+
+    public LogRecord[] getLines() { // copies the list into a new array on every call
+        return this.logLines.toArray(new LogRecord[]{});
+    }
+
+    public void publish(LogRecord record) { // no level or filter check here: every record is kept
+        this.logLines.add(record);
+    }
+
+    public void close(){} // nothing to release
+    public void flush(){} // nothing is buffered
+}
diff --git a/contrib/src/test/resources/ExtractorYoutubeDL.json b/contrib/src/test/resources/ExtractorYoutubeDL.json
diff --git a/modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java
@@ -1,5 +1,8 @@
 package org.archive.modules.warc;
 
+import org.archive.io.warc.WARCRecordInfo;
+import org.archive.modules.CrawlURI;
+
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.UUID;
@@ -12,4 +15,7 @@ public static URI generateRecordID() {
             throw new RuntimeException(e); // impossible 
         }
     }
+    public void postWrite(WARCRecordInfo recordInfo, CrawlURI curi) { // no-op implementation of the post-write hook
+        return;
+    }
 }
diff --git a/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java
@@ -43,4 +43,6 @@ public interface WARCRecordBuilder {
     WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
             throws IOException;
 
-}
+    void postWrite(WARCRecordInfo warcRecordInfo, CrawlURI curi); // invoked by WARCWriterChainProcessor.writeRecords after the record is written; NOTE(review): new abstract method, so implementers must add it - confirm whether a default method is preferable
+
+}
diff --git a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java
@@ -163,7 +163,9 @@ protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException
             if (recordBuilder.shouldBuildRecord(curi)) {
                 WARCRecordInfo record = recordBuilder.buildRecord(curi, concurrentTo);
                 if (record != null) {
+                    record.setWARCFileOffset(writer.getPosition());
                     writer.writeRecord(record);
+                    record.setWARCFilename(writer.getFilenameWithoutOccupiedSuffix());
                     InputStream is = null;
                     try {
                         is = record.getContentStream();
@@ -178,6 +180,7 @@ protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException
                     if (concurrentTo == null) {
                         concurrentTo = record.getRecordId();
                     }
+                    recordBuilder.postWrite(record, curi);
                 }
             }
         }