From 5c4f42b53eddde3bb1a02d1fdf19264613327818 Mon Sep 17 00:00:00 2001 From: Francesco Meli Date: Thu, 15 Aug 2024 12:29:41 +0200 Subject: [PATCH] fix: name of item fetched from caasa.it, remove ls from locations select, update ttl record on sqlite --- server/db.py | 10 ++++++++++ server/jobs.py | 4 ++-- server/services/scraper.py | 28 +++++++++++++++------------- src/components/Request/Request.tsx | 2 +- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/server/db.py b/server/db.py index 7e4bf0c..8f756b1 100644 --- a/server/db.py +++ b/server/db.py @@ -36,6 +36,16 @@ class Meta: database = db table_name = 'locations' + def get_nome_for(self, source): + return self.nome + + def get_provincia_nome_for(self, source): + if source == 'caasa.it': + if self.provincia_nome == "Reggio nell'Emilia": + return 'Reggio Emilia' + return self.provincia_nome + + class House(Model): uuid = CharField(primary_key=True) url = CharField() diff --git a/server/jobs.py b/server/jobs.py index faad215..a9bef31 100644 --- a/server/jobs.py +++ b/server/jobs.py @@ -6,8 +6,8 @@ from peewee import fn def fetch_homes(location: Location): - # Check if the lowest updated_at of House is > 1 day ago - one_day_ago = datetime.now() - timedelta(days=1) + # Check if the lowest updated_at of House is older than 60 seconds (NOTE(review): variable name still says one day; confirm the intended TTL) + one_day_ago = datetime.now() - timedelta(seconds=60) lowest_updated_at = House.select(fn.Min(House.updated_at)).where((House.city == location.nome) & (House.province == location.provincia_nome)).scalar() if lowest_updated_at is None or lowest_updated_at < one_day_ago: diff --git a/server/services/scraper.py b/server/services/scraper.py index e141e88..bb9c37d 100644 --- a/server/services/scraper.py +++ b/server/services/scraper.py @@ -10,11 +10,6 @@ class ComuniItalia: HOST = "raw.githubusercontent.com" - def trasform_city_name(name): - if name == "Reggio nell'Emilia": - return "Reggio Emilia" - return name - def fetch(): html_text = get(ComuniItalia.HOST, 
"matteocontrini/comuni-json/master/comuni.json") json_string = unidecode(html_text) @@ -23,14 +18,14 @@ def fetch(): upsert_record( Location, 'codice', - nome=ComuniItalia.trasform_city_name(comune.get('nome', '')), + nome=comune.get('nome', ''), codice=comune.get('codice', ''), zona_codice=comune['zona']['codice'] if 'zona' in comune else None, zona_nome=comune['zona']['nome'] if 'zona' in comune else None, regione_codice=comune['regione']['codice'] if 'regione' in comune else None, regione_nome=comune['regione']['nome'] if 'regione' in comune else None, provincia_codice=comune['provincia']['codice'] if 'provincia' in comune else None, - provincia_nome=ComuniItalia.trasform_city_name(comune['provincia']['nome'] if 'provincia' in comune else None), + provincia_nome=comune['provincia']['nome'] if 'provincia' in comune else None, sigla=comune.get('sigla', ''), codiceCatastale=comune.get('codiceCatastale', ''), cap=comune['cap'] if 'cap' in comune else None, @@ -251,13 +246,18 @@ def get_main_image(soup): return None def get_comment(soup): - return soup.find('div', attrs={"class": "opinion-main-text"}).get_text(strip=True) + return " ".join(soup.find('div', attrs={"class": "opinion-main-text"}).stripped_strings) + + def generate_readable_title(url, comune): + match = re.search(r'-([a-zA-Z0-9]*\d+[a-zA-Z0-9]*)-', url) + uuid = match.group(1) if match else 'N/A' + return f"{comune} (Caasa.it ID: {uuid})" try: html_text = get(Caasa.HOST, page_link) soup = BeautifulSoup(html_text, "html.parser") price = get_value_from_label(soup, "Prezzo") - title = page_link + title = generate_readable_title(page_link, comune) location = get_value_from_label(soup, "Zona OMI") comment = get_comment(soup) m2_string = get_value_from_label(soup, "Superficie") @@ -341,13 +341,15 @@ def get_final_url(page): max_size = max_size, ) - return "/{province}/{comune}/{types}/in-vendita.html?page={page}&{filters}".format( - province=Caasa.format_name(location.provincia_nome), 
comune=Caasa.format_name(location.nome), + url = "/{province}/{comune}/{types}/in-vendita.html?page={page}&{filters}".format( + province=Caasa.format_name(location.get_provincia_nome_for('caasa.it')), + comune=Caasa.format_name(location.get_nome_for('caasa.it')), types='-o-'.join(Caasa.ELEMENT_TYPES), filters=filters, page = page, ) + + return url def get_house_link(item_html): fav_container = item_html.find("div", attrs={"class": "favorite-add"}) @@ -410,7 +412,7 @@ def get_house_link(item_html): # Create a new function that takes both fixed_param and link get_house_data_with_meta = partial( Caasa.get_house_data, - location.nome, + location.get_nome_for('caasa.it'), ) with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor: diff --git a/src/components/Request/Request.tsx b/src/components/Request/Request.tsx index 6f49bbb..6761e81 100644 --- a/src/components/Request/Request.tsx +++ b/src/components/Request/Request.tsx @@ -20,7 +20,7 @@ interface TreeSelectCity { const Request = ({ className, }: RequestProps) => { - const [citiesTree, setCitiesTree] = useLocalStorage('location', []) + const [citiesTree, setCitiesTree] = useState([]) const [request, setRequest] = useLocalStorage('requestUUID', null) const [error, setError] = useState(null) const [codes, setCodes] = useState(null)