-
Notifications
You must be signed in to change notification settings - Fork 0
/
US_scrapper.py
128 lines (90 loc) · 4.25 KB
/
US_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
# coding: utf-8
# In[58]:
import itertools
import logging
import re

from pymongo import MongoClient
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# In[64]:
# MongoDB handles used by the scraper: documents are written to the
# "products" collection of the "amazon_us_911_180" database.
# MongoClient() is called without arguments, so pymongo's default
# connection settings apply.
mongo_client = MongoClient()
db = mongo_client["amazon_us_911_180"]
col = db["products"]
# # Original
# ad, first row: B07DLGZH28
# second in search results B07WH8BJSR, B08PSKLDDQ, B07BPC6F3C, B013IJLUQ2
# ## copy
# In[68]:
#initializing Selenium
def _parse_detail_bullets(bullet_texts):
    """Convert the text of the "detail bullets" lists into a flat dict.

    Every line of every text is split at its first ``:``; the part before
    the colon becomes the key and the rest becomes the value.  Both are
    stored exactly as scraped (no stripping) so keys stay identical to the
    ones produced before.  Lines without a colon are skipped instead of
    shifting the key/value pairing of everything that follows.

    If a label or value contains ``;``, the first whitespace-delimited
    token after its last ``;`` is also stored under ``'pounds_weight'``
    (the last such token wins).
    NOTE(review): no unit conversion is done, so the value is only really
    "pounds" if the page reports it that way -- confirm with consumers.

    Args:
        bullet_texts: iterable of strings, one per ``<ul>`` element.

    Returns:
        dict mapping scraped labels to scraped values.
    """
    details = {}
    weight = None
    for text in bullet_texts:
        for row in text.split("\n"):
            label, colon, value = row.partition(":")
            if not colon:
                continue
            details[label] = value
            for piece in (label, value):
                if ";" not in piece:
                    continue
                tail = piece.rsplit(";", 1)[-1].split(maxsplit=1)
                if tail:  # nothing after the ';' -> no weight token
                    weight = tail[0]
    if weight is not None:
        # Added last so it sits after the scraped labels, as before.
        details["pounds_weight"] = weight
    return details


def _open_product_page(driver, asin):
    """Search amazon.com for *asin* and click through to a product page.

    Returns:
        True when the result page holds more than one ``s-image`` element
        (treated as "ads present"; the link matching the ASIN is clicked),
        False when it holds exactly one (that image is clicked), and
        None when it holds none, i.e. there is nothing to open.

    Raises:
        NoSuchElementException: if the search box or the ASIN-specific
            result link cannot be located.
    """
    driver.get("https://amazon.com")
    search_box = driver.find_element(By.XPATH, '//*[@id="twotabsearchtextbox"]')
    search_box.send_keys(asin)
    search_box.send_keys(Keys.RETURN)
    # NOTE(review): no explicit wait follows the submit; confirm that the
    # result page is reliably loaded before it is queried below.
    result_images = driver.find_elements(By.CLASS_NAME, 's-image')
    if not result_images:
        return None
    if len(result_images) > 1:
        # Several results: pick the one whose popover data names the ASIN.
        driver.find_element(
            By.XPATH,
            "//span[contains(@data-a-popover,'asin={}')]/parent::span/parent::div/parent::div/parent::div/div/h2/a".format(asin),
        ).click()
        return True
    result_images[0].click()
    return False


def _scrape_product_document(driver, asin):
    """Build the MongoDB document for one ASIN using an open *driver*.

    Returns the placeholder ``{'data': nan}`` when the search produced no
    result; otherwise a dict with the detail-bullet entries followed by
    ``ads``, ``image_source``, ``Best Seller Rank``, ``title``, ``price``,
    ``stars`` and ``review_count``.

    Raises:
        NoSuchElementException: if any expected page element is missing.
    """
    ads = _open_product_page(driver, asin)
    if ads is None:
        # The original code referenced an undefined name ``NAN`` here; a
        # float NaN is stored as the "no data" marker instead.
        return {'data': float('nan')}

    bullet_lists = driver.find_element(
        By.ID, 'detailBullets_feature_div'
    ).find_elements(By.TAG_NAME, 'ul')
    document = _parse_detail_bullets(ul.text for ul in bullet_lists)
    document['ads'] = ads

    document['image_source'] = driver.find_element(
        By.XPATH, '//*[@id="landingImage"]').get_attribute('src')
    document['Best Seller Rank'] = driver.find_element(
        By.XPATH, '//*[@id="detailBulletsWrapper_feature_div"]/ul[1]/li/span').text
    document['title'] = driver.find_element(
        By.ID, 'productTitle').get_attribute('innerHTML')
    document['price'] = driver.find_element(
        By.XPATH, '//*[@id="corePrice_desktop"]/div/table/tbody/tr/td[2]/span[1]').text
    document['stars'] = driver.find_element(
        By.CLASS_NAME, 'a-icon-alt').get_attribute('innerHTML')
    document['review_count'] = driver.find_element(
        By.XPATH, '//*[@id="reviewsMedley"]/div/div[1]/div[2]/div[2]').text
    return document


def scrapper(asin_file='test.txt',
             chromedriver_path='/home/shahin/Downloads/chromedriver'):
    """Scrape one Amazon product page per ASIN and store it in MongoDB.

    Reads *asin_file* line by line (one ASIN per line; blank lines are
    skipped), opens a fresh Chrome session for each ASIN, scrapes the
    product page and inserts the resulting document into the module-level
    ``col`` collection.  A failure while browsing one ASIN is logged and
    the run continues with the next one; the browser is always closed.

    Args:
        asin_file: path of the text file listing the ASINs.
        chromedriver_path: path of the chromedriver executable.

    Returns:
        None.

    Raises:
        OSError: if *asin_file* cannot be opened.
        Errors from starting Chrome or from the MongoDB insert propagate,
        since they are not specific to a single ASIN.
    """
    logger = logging.getLogger(__name__)
    with open(asin_file) as file:
        for line in file:
            # Drop the trailing newline: typing it would submit the search
            # early and it would never match the ``asin=`` XPath test.
            asin = line.strip()
            if not asin:
                continue
            driver = webdriver.Chrome(service=Service(chromedriver_path))
            try:
                document = _scrape_product_document(driver, asin)
            except Exception:
                # Best-effort per ASIN (the original silently swallowed
                # these errors); record the failure and move on.
                logger.exception("Skipping ASIN %s: scraping failed", asin)
                continue
            finally:
                driver.quit()
            col.insert_one(document)
# In[ ]: