From bfe14cfc74a43d00d16c83e4fad70f369c29a053 Mon Sep 17 00:00:00 2001 From: Jay Hodgson Date: Mon, 29 Jul 2024 14:26:04 -0700 Subject: [PATCH 1/2] SWC-7006 --- .../web/server/servlet/filter/HtmlInjectionFilter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/sagebionetworks/web/server/servlet/filter/HtmlInjectionFilter.java b/src/main/java/org/sagebionetworks/web/server/servlet/filter/HtmlInjectionFilter.java index 01ce82209a..48402599ad 100644 --- a/src/main/java/org/sagebionetworks/web/server/servlet/filter/HtmlInjectionFilter.java +++ b/src/main/java/org/sagebionetworks/web/server/servlet/filter/HtmlInjectionFilter.java @@ -180,8 +180,9 @@ protected void doFilterInternal( String domain = request.getServerName(); String lowerCaseDomain = domain.toLowerCase(); boolean isSynapseTestSite = - !(lowerCaseDomain.contains("www.synapse.org") || - lowerCaseDomain.contains("127.0.0.1")); + (lowerCaseDomain.contains("staging.") || + lowerCaseDomain.contains("dev.") || + lowerCaseDomain.contains("tst.")); boolean includeBotHtml = isLikelyBot && !isSynapseTestSite; try { // customize data model for this particular page From 285a06a2b27db8ebecb40abd190ee4a694af6daf Mon Sep 17 00:00:00 2001 From: Jay Hodgson Date: Mon, 29 Jul 2024 16:28:06 -0700 Subject: [PATCH 2/2] SWC-7006: revert previous change (disproven hypothesis), and let the crawler find each page of the project search results --- .../server/servlet/filter/CrawlFilter.java | 74 +++++-------------- .../servlet/filter/HtmlInjectionFilter.java | 10 +-- 2 files changed, 20 insertions(+), 64 deletions(-) diff --git a/src/main/java/org/sagebionetworks/web/server/servlet/filter/CrawlFilter.java b/src/main/java/org/sagebionetworks/web/server/servlet/filter/CrawlFilter.java index 07197af81e..c15206f075 100644 --- a/src/main/java/org/sagebionetworks/web/server/servlet/filter/CrawlFilter.java +++ b/src/main/java/org/sagebionetworks/web/server/servlet/filter/CrawlFilter.java @@ -2,8
+2,6 @@ import static org.apache.commons.lang.StringEscapeUtils.escapeHtml; -import com.google.gwt.thirdparty.guava.common.base.Supplier; -import com.google.gwt.thirdparty.guava.common.base.Suppliers; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; @@ -13,11 +11,11 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TimeZone; -import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import org.apache.commons.io.IOUtils; @@ -79,8 +77,6 @@ public class CrawlFilter { ""; SynapseClient synapseClient = null; JSONObjectAdapter jsonObjectAdapter = null; - private final Supplier homePageCached = - Suppliers.memoizeWithExpiration(homePageSupplier(), 1, TimeUnit.DAYS); public static final int MAX_CHILD_PAGES = 5; // Markdown processor @@ -97,30 +93,14 @@ public static String removeSynapseWikiWidgets(String markdown) { return wikiWidgetPattern.matcher(markdown).replaceAll(""); } - public String getCachedHomePageHtml() { - return homePageCached.get(); - } - - private Supplier homePageSupplier() { - return new Supplier() { - public String get() { - try { - return getHomePageHtml(); - } catch (JSONObjectAdapterException | RestServiceException e) { - return e.getMessage(); - } - } - }; - } - public void init(SynapseClient synapseClient) { this.synapseClient = synapseClient; df.setTimeZone(TimeZone.getTimeZone("UTC")); } - private String getHomePageHtml() - throws JSONObjectAdapterException, RestServiceException { + public String getHomePageHtml() + throws JSONObjectAdapterException, RestServiceException, UnsupportedEncodingException { StringBuilder html = new StringBuilder(); // add direct links to all public projects in the system @@ -129,8 +109,11 @@ private String getHomePageHtml() projectsOnly.setKey("node_type"); 
projectsOnly.setValue("project"); query.getBooleanQuery().add(projectsOnly); - //limit to 100 at a time - query.setSize(100L); + query.setQueryTerm(Collections.singletonList("")); + //limit to 50 at a time + query.setSize(50L); + query.setStart(0L); + html.append( "

" + DisplayConstants.DEFAULT_PAGE_TITLE + @@ -144,37 +127,14 @@ private String getHomePageHtml() TeamSearch.START_DELIMITER + "0\">Teams


" ); - try { - SearchResults results = synapseClient.search(query); - // append this set to the list - while (results.getHits().size() > 0) { - for (Hit hit : results.getHits()) { - // SWC-5149: send a Project alias link to the crawler if available. - if (hit.getAlias() != null) { - html.append( - "" + - hit.getName() + - "
" - ); - } else { - html.append( - "" + - hit.getName() + - "
" - ); - } - } - long newStart = results.getStart() + results.getHits().size(); - query.setStart(newStart); - results = synapseClient.search(query); - } - } catch (Exception e) { - e.printStackTrace(); - } + + String newJson = EntityFactory.createJSONStringForEntity(query); + + html.append( + "Projects
" + ); html.append(""); return html.toString(); } @@ -613,7 +573,7 @@ public String getAllProjectsHtml(SearchQuery inputQuery) String newJson = EntityFactory.createJSONStringForEntity(inputQuery); html.append( "Next Page
" ); return html.toString(); diff --git a/src/main/java/org/sagebionetworks/web/server/servlet/filter/HtmlInjectionFilter.java b/src/main/java/org/sagebionetworks/web/server/servlet/filter/HtmlInjectionFilter.java index 48402599ad..e5affe8802 100644 --- a/src/main/java/org/sagebionetworks/web/server/servlet/filter/HtmlInjectionFilter.java +++ b/src/main/java/org/sagebionetworks/web/server/servlet/filter/HtmlInjectionFilter.java @@ -180,9 +180,8 @@ protected void doFilterInternal( String domain = request.getServerName(); String lowerCaseDomain = domain.toLowerCase(); boolean isSynapseTestSite = - (lowerCaseDomain.contains("staging.") || - lowerCaseDomain.contains("dev.") || - lowerCaseDomain.contains("tst.")); + !(lowerCaseDomain.contains("www.synapse.org") || + lowerCaseDomain.contains("127.0.0.1")); boolean includeBotHtml = isLikelyBot && !isSynapseTestSite; try { // customize data model for this particular page @@ -200,10 +199,7 @@ protected void doFilterInternal( if (isHomePage) { // use defaults in the dataModel, but also get crawl data if this is a bot if (includeBotHtml) { - dataModel.put( - BOT_BODY_HTML_KEY, - crawlFilter.getCachedHomePageHtml() - ); + dataModel.put(BOT_BODY_HTML_KEY, crawlFilter.getHomePageHtml()); } } else if (path.startsWith("/Synapse")) { Synapse place = new Synapse(placeToken);