From 6aa7b51ef06a6f077bc70021577297762cb8318d Mon Sep 17 00:00:00 2001
From: Fabien Boucher
Date: Wed, 20 Sep 2023 10:26:17 +0000
Subject: [PATCH] cli - Add the Janitor set-crawler-commit-date command

---
 CHANGELOG.md                   |  1 +
 README.md                      |  9 +++++++++
 src/CLI.hs                     | 18 +++++++++++++++++-
 src/Monocle/Backend/Index.hs   | 23 ++++++++++++++---------
 src/Monocle/Backend/Janitor.hs | 18 ++++++++++++++++++
 src/Monocle/Config.hs          |  4 ++--
 6 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 293a4c63e..dd55942fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file.
 ### Added
 
 - [crawler] Enable usage of the GitHub user PRs crawler via the Monocle config.
+- [cli] Add a Janitor command to reset the commit date of a crawler
 
 ### Changed
 
diff --git a/README.md b/README.md
index 76bc3ad74..5b204e279 100644
--- a/README.md
+++ b/README.md
@@ -625,6 +625,15 @@ docker-compose run --rm --no-deps api monocle janitor wipe-crawler-data --elasti
 docker-compose start crawler
 ```
 
+## Reset the crawler commit date
+
+Monocle crawlers keep track of the last date (the commit date) at which documents were successfully fetched. The command
+below can be used to force a crawler to fetch documents again starting from a given date.
+
+```bash
+docker-compose run --rm --no-deps api monocle janitor set-crawler-commit-date --elastic elastic:9200 --config /etc/monocle/config.yaml --workspace <workspace> --crawler-name <crawler-name> --commit-date 2023-01-01
+```
+
 ## Components
 
 ![architecture](./doc/architecture.png)
diff --git a/src/CLI.hs b/src/CLI.hs
index 39c2527ac..1e243dae2 100644
--- a/src/CLI.hs
+++ b/src/CLI.hs
@@ -142,11 +142,13 @@ usageJanitor =
   subparser
     ( mkSubCommand "update-idents" "Update author identities" janitorUpdateIdent
         <> mkSubCommand "wipe-crawler-data" "Remove changes/task-data and events related to a crawler name" janitorRemoveCrawlerData
+        <> mkSubCommand "set-crawler-commit-date" "Overwrite the crawler commit date" janitorSetCrawlerCommitDate
     )
  where
   configOption = strOption (long "config" <> O.help "Path to configuration file" <> metavar "MONOCLE_CONFIG")
   elasticOption = strOption (long "elastic" <> O.help "The Elastic endpoint url" <> metavar "MONOCLE_ELASTIC_URL")
   workspaceOption = strOption (long "workspace" <> O.help "Workspace name" <> metavar "WORKSPACE")
+  crawlerNameOption = strOption (long "crawler-name" <> O.help "The crawler name" <> metavar "CRAWLER_NAME")
   runOnWorkspace env action' workspace = runEff $ runLoggerEffect $ runElasticEffect env $ runEmptyQueryM workspace action'
   noWorkspace workspaceName = "Unable to find the workspace " <> workspaceName <> " in the Monocle config"
   janitorUpdateIdent = io <$> parser
@@ -163,7 +165,6 @@ usageJanitor =
         Nothing -> traverse_ (runOnWorkspace env J.updateIdentsOnWorkspace) $ Config.getWorkspaces config
   janitorRemoveCrawlerData = io <$> parser
    where
-    crawlerNameOption = strOption (long "crawler-name" <> O.help "The crawler name" <> metavar "CRAWLER_NAME")
     parser = (,,,) <$> configOption <*> elasticOption <*> workspaceOption <*> crawlerNameOption
     io (configPath, elasticUrl, workspaceName, crawlerName) = do
       config <- Config.loadConfigWithoutEnv configPath
@@ -173,6 +174,21 @@ usageJanitor =
         Just workspace -> do
           runOnWorkspace env (J.wipeCrawlerData crawlerName) workspace
           runOnWorkspace env (J.removeTDCrawlerData crawlerName) workspace
+  janitorSetCrawlerCommitDate = io <$> parser
+   where
+    newDateOption = strOption (long "commit-date" <> O.help "The new crawler commit-date" <> metavar "COMMIT_DATE")
+    parser = (,,,,) <$> configOption <*> elasticOption <*> workspaceOption <*> crawlerNameOption <*> newDateOption
+    io (configPath, elasticUrl, workspaceName, crawlerName, newDate) = do
+      config <- Config.loadConfigWithoutEnv configPath
+      env <- mkEnv $ getURL elasticUrl
+      void $ case Config.lookupTenant (Config.getWorkspaces config) workspaceName of
+        Nothing -> print $ noWorkspace workspaceName
+        Just workspace ->
+          runOnWorkspace
+            env
+            ( J.updateCrawlerMDLastUpdatedDate workspace crawlerName newDate
+            )
+            workspace
 
 ---------------------------------------------------------------
 -- Lentille cli
diff --git a/src/Monocle/Backend/Index.hs b/src/Monocle/Backend/Index.hs
index c885d9a65..10c9cf0b1 100644
--- a/src/Monocle/Backend/Index.hs
+++ b/src/Monocle/Backend/Index.hs
@@ -931,15 +931,20 @@ initCrawlerEntities entities worker = traverse_ run entities
   defaultUpdatedSince = getWorkerUpdatedSince worker
 
 initCrawlerMetadata :: MonoQuery :> es => IndexEffects es => Config.Crawler -> Eff es ()
-initCrawlerMetadata crawler =
-  initCrawlerEntities
-    ( getProjectEntityFromCrawler
-        <> getOrganizationEntityFromCrawler
-        <> getTaskDataEntityFromCrawler
-        <> getProjectIssueFromCrawler
-        <> getUserEntityFromCrawler
-    )
-    crawler
+initCrawlerMetadata crawler = initCrawlerEntities (getCrawlerEntities crawler) crawler
+
+resetCrawlerMetadataLastUpdatedDate :: MonoQuery :> es => IndexEffects es => Config.Crawler -> UTCTime -> Eff es ()
+resetCrawlerMetadataLastUpdatedDate crawler newDate = do
+  let crawlerName = CrawlerName $ Config.getCrawlerName crawler
+  traverse_ (setLastUpdated crawlerName newDate) (getCrawlerEntities crawler)
+
+getCrawlerEntities :: Config.Crawler -> [Entity]
+getCrawlerEntities crawler =
+  getProjectEntityFromCrawler
+    <> getOrganizationEntityFromCrawler
+    <> getTaskDataEntityFromCrawler
+    <> getProjectIssueFromCrawler
+    <> getUserEntityFromCrawler
  where
   getProjectEntityFromCrawler = Project <$> Config.getCrawlerProject crawler
   getProjectIssueFromCrawler = ProjectIssue <$> Config.getCrawlerProjectIssue crawler
diff --git a/src/Monocle/Backend/Janitor.hs b/src/Monocle/Backend/Janitor.hs
index a4a473b84..a2b5cf49b 100644
--- a/src/Monocle/Backend/Janitor.hs
+++ b/src/Monocle/Backend/Janitor.hs
@@ -5,6 +5,7 @@ module Monocle.Backend.Janitor (
   updateIdentsOnEvents,
   updateIdentsOnChanges,
   updateIdentsOnWorkspace,
+  updateCrawlerMDLastUpdatedDate,
   removeProjectMD,
 ) where
 
@@ -300,3 +301,20 @@ removeMD entity crawlerName = do
         >>> I.bulkStream
     )
   logInfo "Deleted metadata" ["crawler" .= crawlerName, "count" .= deletedCount]
+
+updateCrawlerMDLastUpdatedDate :: QEffects es => Config.Index -> Text -> Text -> Eff es ()
+updateCrawlerMDLastUpdatedDate index crawlerNameText newDateText = do
+  let eCheckParam = do
+        newDate <-
+          toEither "Unable to parse the date: Expected format YYYY-mm-dd or YYYY-mm-dd hh:mm:ss UTC"
+            $ parseDateValue (from newDateText)
+        crawler <- toEither "Unable to find the crawler" $ Config.lookupCrawler index crawlerNameText
+        pure (newDate, crawler)
+  case eCheckParam of
+    Left err -> logInfo err ["crawler" .= crawlerNameText, "newDate" .= newDateText]
+    Right (newDate, crawler) -> I.resetCrawlerMetadataLastUpdatedDate crawler newDate
+ where
+  toEither :: Text -> Maybe a -> Either Text a
+  toEither msg = \case
+    Just a -> Right a
+    Nothing -> Left msg
diff --git a/src/Monocle/Config.hs b/src/Monocle/Config.hs
index 4f69673da..0b27707c4 100644
--- a/src/Monocle/Config.hs
+++ b/src/Monocle/Config.hs
@@ -379,9 +379,9 @@ lookupProject index projectName = find isProject (fromMaybe [] (projects index))
 
 -- | Find a 'Crawler' in an 'Index'
 lookupCrawler :: Index -> Text -> Maybe Crawler
-lookupCrawler index crawlerName = find isProject index.crawlers
+lookupCrawler index crawlerName = find isCrawler index.crawlers
  where
-  isProject Crawler {..} = name == crawlerName
+  isCrawler Crawler {..} = name == crawlerName
 
 -- | Find an 'Ident' in an 'Index'
 lookupIdent :: Index -> Text -> Maybe Ident
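
For reference, a usage sketch of the new subcommand, based on the README example added above. The workspace name (example-workspace) and crawler name (github) are illustrative placeholders, not values from this patch, and the second date form follows the "YYYY-mm-dd hh:mm:ss UTC" format named in the Janitor error message.

```bash
# Sketch only: the workspace and crawler names below are placeholders and must
# match entries defined in the Monocle config file.
docker-compose run --rm --no-deps api monocle janitor set-crawler-commit-date \
  --elastic elastic:9200 \
  --config /etc/monocle/config.yaml \
  --workspace example-workspace \
  --crawler-name github \
  --commit-date 2023-01-01

# A full timestamp is also accepted, per the formats expected by the date parser:
#   --commit-date "2023-01-01 12:00:00 UTC"
```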