-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Laurent Klock <[email protected]>
- Loading branch information
Showing
14 changed files
with
546 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
11 changes: 6 additions & 5 deletions
11
API/src/main/java/crawlercommons/urlfrontier/Urlfrontier.java
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
130 changes: 130 additions & 0 deletions
130
client/src/main/java/crawlercommons/urlfrontier/client/ListURLs.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
// SPDX-FileCopyrightText: 2020 Crawler-commons | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package crawlercommons.urlfrontier.client; | ||
|
||
import crawlercommons.urlfrontier.URLFrontierGrpc; | ||
import crawlercommons.urlfrontier.URLFrontierGrpc.URLFrontierBlockingStub; | ||
import crawlercommons.urlfrontier.Urlfrontier.Pagination.Builder; | ||
import crawlercommons.urlfrontier.Urlfrontier.URLItem; | ||
import io.grpc.ManagedChannel; | ||
import io.grpc.ManagedChannelBuilder; | ||
import java.io.File; | ||
import java.io.IOException; | ||
import java.io.PrintStream; | ||
import java.nio.charset.Charset; | ||
import java.time.Instant; | ||
import java.time.LocalDateTime; | ||
import java.time.ZoneId; | ||
import java.util.Iterator; | ||
import picocli.CommandLine.Command; | ||
import picocli.CommandLine.Option; | ||
import picocli.CommandLine.ParentCommand; | ||
|
||
@Command( | ||
name = "ListURLs", | ||
description = "Prints out all URLs in the Frontier", | ||
sortOptions = false) | ||
public class ListURLs implements Runnable { | ||
|
||
@ParentCommand private Client parent; | ||
|
||
@Option( | ||
names = {"-n", "--number_urls"}, | ||
defaultValue = "100", | ||
paramLabel = "NUM", | ||
description = "maximum number of URLs to return (default 100)") | ||
private int maxNumURLs; | ||
|
||
@Option( | ||
names = {"-s", "--start"}, | ||
defaultValue = "0", | ||
paramLabel = "NUM", | ||
description = "starting position of URL to return (default 0)") | ||
private int start; | ||
|
||
@Option( | ||
names = {"-o", "--output"}, | ||
defaultValue = "", | ||
paramLabel = "STRING", | ||
description = "output file to dump all the URLs") | ||
private String output; | ||
|
||
@Option( | ||
names = {"-c", "--crawlID"}, | ||
defaultValue = "DEFAULT", | ||
paramLabel = "STRING", | ||
description = "crawl to get the queues for") | ||
private String crawl; | ||
|
||
@Option( | ||
names = {"-l", "--local"}, | ||
defaultValue = "false", | ||
paramLabel = "BOOLEAN", | ||
description = | ||
"restricts the scope to this frontier instance instead of aggregating over the cluster") | ||
private Boolean local; | ||
|
||
@Option( | ||
names = {"-p", "--parsedate"}, | ||
defaultValue = "false", | ||
description = { | ||
"Print the refetch date in local time zone", | ||
"By default, time is in UTC seconds since the Unix epoch" | ||
}) | ||
private boolean parse; | ||
|
||
// Use the system default time zone | ||
private ZoneId zoneId = ZoneId.systemDefault(); | ||
|
||
@Override | ||
public void run() { | ||
|
||
Builder builder = crawlercommons.urlfrontier.Urlfrontier.Pagination.newBuilder(); | ||
builder.setLocal(local); | ||
builder.setSize(maxNumURLs); | ||
builder.setStart(start); | ||
builder.setIncludeInactive(true); | ||
builder.setCrawlID(crawl); | ||
|
||
PrintStream outstream = null; | ||
if (output.length() > 0) { | ||
File f = new File(output); | ||
f.delete(); | ||
try { | ||
outstream = new PrintStream(f, Charset.defaultCharset()); | ||
} catch (IOException e) { | ||
e.printStackTrace(System.err); | ||
return; | ||
} | ||
} else { | ||
outstream = System.out; | ||
} | ||
|
||
ManagedChannel channel = | ||
ManagedChannelBuilder.forAddress(parent.hostname, parent.port) | ||
.usePlaintext() | ||
.build(); | ||
URLFrontierBlockingStub blockingFrontier = URLFrontierGrpc.newBlockingStub(channel); | ||
|
||
Iterator<URLItem> it = blockingFrontier.listURLs(builder.build()); | ||
while (it.hasNext()) { | ||
|
||
URLItem item = it.next(); | ||
|
||
String fetchDate; | ||
if (parse) { | ||
Instant instant = Instant.ofEpochSecond(item.getKnown().getRefetchableFromDate()); | ||
LocalDateTime localDate = instant.atZone(zoneId).toLocalDateTime(); | ||
fetchDate = localDate.toString(); | ||
} else { | ||
fetchDate = String.valueOf(item.getKnown().getRefetchableFromDate()); | ||
} | ||
|
||
outstream.println(item.getKnown().getInfo().getUrl() + ";" + fetchDate); | ||
} | ||
|
||
outstream.close(); | ||
channel.shutdownNow(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.