Skip to content

Commit

Permalink
feat: wip
Browse files Browse the repository at this point in the history
  • Loading branch information
bolinocroustibat committed Sep 11, 2024
1 parent c6424a6 commit 32ac865
Showing 1 changed file with 33 additions and 22 deletions.
55 changes: 33 additions & 22 deletions udata_hydra/crawl/select_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,28 +48,39 @@ async def select_batch_resources_to_check() -> list[Record]:

# if not enough for our batch size, handle outdated checks
if len(to_check) < config.BATCH_SIZE:
since = parse_timespan(config.SINCE) # in seconds
since = datetime.now(timezone.utc) - timedelta(seconds=since)
limit = config.BATCH_SIZE - len(to_check)
q = f"""
SELECT * FROM (
SELECT catalog.url, dataset_id, catalog.resource_id
FROM catalog
AND {excluded}
JOIN checks ON catalog.last_check = checks.id
WHERE (
(checks.detected_last_modified_at IS NULL AND checks.created_at < CURRENT_DATE - INTERVAL '60 days')
OR
(checks.detected_last_modified_at IS NOT NULL AND (
(checks.created_at >= checks.detected_last_modified_at + INTERVAL '30 days')
OR
(checks.created_at >= checks.detected_last_modified_at + INTERVAL '86400 seconds')
))
)
AND catalog.priority = False
) s
ORDER BY random() LIMIT {limit};
"""
to_check += await connection.fetch(q)

DELAYS = ["12 hours", "1 day", "7 days", "30 days"]

# Base query parts
query_start = """
SELECT * FROM (
SELECT catalog.url, dataset_id, catalog.resource_id
FROM catalog
JOIN checks ON catalog.last_check = checks.id
WHERE (
(checks.detected_last_modified_at IS NULL AND checks.created_at < CURRENT_DATE - INTERVAL '60 days')
OR
(checks.detected_last_modified_at IS NOT NULL AND (
"""

query_end = """
))
)
AND catalog.priority = False
) s
ORDER BY random() LIMIT 40;
"""

# Construct the dynamic part of the query
dynamic_conditions = " OR ".join(
f"(checks.created_at >= checks.detected_last_modified_at + INTERVAL '{delay}' AND checks.created_at < CURRENT_DATE - INTERVAL '{delay}')"
for delay in DELAYS
)

# Combine all parts to form the final query
final_query = f"{query_start} {dynamic_conditions} {query_end}"

to_check += await connection.fetch(final_query)

return to_check

0 comments on commit 32ac865

Please sign in to comment.