-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #93 from klockla/listurl_github
Add method ListURLs to list all URLs known in the frontier with their next fetch date
- Loading branch information
Showing
14 changed files
with
24,068 additions
and
19,989 deletions.
There are no files selected for viewing
3,485 changes: 2,102 additions & 1,383 deletions
3,485
API/src/main/java/crawlercommons/urlfrontier/URLFrontierGrpc.java
Large diffs are not rendered by default.
Oops, something went wrong.
39,708 changes: 21,129 additions & 18,579 deletions
39,708
API/src/main/java/crawlercommons/urlfrontier/Urlfrontier.java
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
162 changes: 162 additions & 0 deletions
162
client/src/main/java/crawlercommons/urlfrontier/client/ListURLs.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
// SPDX-FileCopyrightText: 2020 Crawler-commons | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package crawlercommons.urlfrontier.client; | ||
|
||
import com.google.protobuf.InvalidProtocolBufferException; | ||
import com.google.protobuf.util.JsonFormat; | ||
import com.google.protobuf.util.JsonFormat.Printer; | ||
import crawlercommons.urlfrontier.URLFrontierGrpc; | ||
import crawlercommons.urlfrontier.URLFrontierGrpc.URLFrontierBlockingStub; | ||
import crawlercommons.urlfrontier.Urlfrontier.ListUrlParams.Builder; | ||
import crawlercommons.urlfrontier.Urlfrontier.URLItem; | ||
import io.grpc.ManagedChannel; | ||
import io.grpc.ManagedChannelBuilder; | ||
import java.io.File; | ||
import java.io.IOException; | ||
import java.io.PrintStream; | ||
import java.nio.charset.Charset; | ||
import java.nio.file.Files; | ||
import java.time.Instant; | ||
import java.time.LocalDateTime; | ||
import java.time.ZoneId; | ||
import java.util.Iterator; | ||
import picocli.CommandLine.Command; | ||
import picocli.CommandLine.Option; | ||
import picocli.CommandLine.ParentCommand; | ||
|
||
@Command( | ||
name = "ListURLs", | ||
description = "Prints out all URLs in the Frontier", | ||
sortOptions = false) | ||
public class ListURLs implements Runnable { | ||
|
||
@ParentCommand private Client parent; | ||
|
||
@Option( | ||
names = {"-n", "--number_urls"}, | ||
defaultValue = "100", | ||
paramLabel = "NUM", | ||
description = "maximum number of URLs to return (default 100)") | ||
private int maxNumURLs; | ||
|
||
@Option( | ||
names = {"-s", "--start"}, | ||
defaultValue = "0", | ||
paramLabel = "NUM", | ||
description = "starting position of URL to return (default 0)") | ||
private int start; | ||
|
||
@Option( | ||
names = {"-k", "--key"}, | ||
required = false, | ||
paramLabel = "STRING", | ||
description = "key to use to target a specific queue") | ||
private String key; | ||
|
||
@Option( | ||
names = {"-o", "--output"}, | ||
defaultValue = "", | ||
paramLabel = "STRING", | ||
description = "output file to dump all the URLs") | ||
private String output; | ||
|
||
@Option( | ||
names = {"-c", "--crawlID"}, | ||
defaultValue = "DEFAULT", | ||
paramLabel = "STRING", | ||
description = "crawl to get the queues for") | ||
private String crawl; | ||
|
||
@Option( | ||
names = {"-l", "--local"}, | ||
defaultValue = "false", | ||
paramLabel = "BOOLEAN", | ||
description = | ||
"restricts the scope to this frontier instance instead of aggregating over the cluster") | ||
private Boolean local; | ||
|
||
@Option( | ||
names = {"-j", "--json"}, | ||
defaultValue = "false", | ||
paramLabel = "BOOLEAN", | ||
description = "Outputs in JSON format") | ||
private Boolean json; | ||
|
||
@Option( | ||
names = {"-p", "--parsedate"}, | ||
defaultValue = "false", | ||
description = { | ||
"Print the refetch date in local time zone", | ||
"By default, time is UTC seconds since the Unix epoch", | ||
"Ignored if JSON output is selected" | ||
}) | ||
private boolean parse; | ||
|
||
// Use the system default time zone | ||
private ZoneId zoneId = ZoneId.systemDefault(); | ||
|
||
@Override | ||
public void run() { | ||
|
||
Builder builder = crawlercommons.urlfrontier.Urlfrontier.ListUrlParams.newBuilder(); | ||
builder.setLocal(local); | ||
if (key != null) { | ||
builder.setKey(key); | ||
} | ||
builder.setSize(maxNumURLs); | ||
builder.setStart(start); | ||
builder.setCrawlID(crawl); | ||
|
||
PrintStream outstream = null; | ||
if (output.length() > 0) { | ||
File f = new File(output); | ||
try { | ||
Files.deleteIfExists(f.toPath()); | ||
outstream = new PrintStream(f, Charset.defaultCharset()); | ||
} catch (IOException e) { | ||
e.printStackTrace(System.err); | ||
return; | ||
} | ||
} else { | ||
outstream = System.out; | ||
} | ||
|
||
Printer jprinter = JsonFormat.printer(); | ||
|
||
ManagedChannel channel = | ||
ManagedChannelBuilder.forAddress(parent.hostname, parent.port) | ||
.usePlaintext() | ||
.build(); | ||
URLFrontierBlockingStub blockingFrontier = URLFrontierGrpc.newBlockingStub(channel); | ||
|
||
Iterator<URLItem> it = blockingFrontier.listURLs(builder.build()); | ||
while (it.hasNext()) { | ||
|
||
URLItem item = it.next(); | ||
|
||
String fetchDate; | ||
if (parse) { | ||
Instant instant = Instant.ofEpochSecond(item.getKnown().getRefetchableFromDate()); | ||
LocalDateTime localDate = instant.atZone(zoneId).toLocalDateTime(); | ||
fetchDate = localDate.toString(); | ||
} else { | ||
fetchDate = String.valueOf(item.getKnown().getRefetchableFromDate()); | ||
} | ||
|
||
if (Boolean.TRUE.equals(json)) { | ||
try { | ||
outstream.println(jprinter.print(item)); | ||
} catch (InvalidProtocolBufferException e) { | ||
e.printStackTrace(System.err); | ||
break; | ||
} | ||
} else { | ||
outstream.println(item.getKnown().getInfo().getUrl() + ";" + fetchDate); | ||
} | ||
} | ||
|
||
outstream.close(); | ||
channel.shutdownNow(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.