Issue
I am scraping data from a website and want to store it in a structured format such as JSON, Excel, SQLite or plain text, so that the data is organised and easy to read. Please help me.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Print the product title, then every reviewer/review pair for one Amazon
# product, following the "Next page" link until it no longer appears.
driver = webdriver.Chrome()
try:
    driver.get('https://www.amazon.in/Skybags-Brat-Black-Casual-Backpack/dp/B08Z1HHHTD/ref=sr_1_2?dchild=1&keywords=skybags&qid=1627786382&sr=8-2')
    # One explicit wait (10 s timeout) is reused for every condition below.
    wait = WebDriverWait(driver, 10)

    product_title = wait.until(
        EC.presence_of_element_located((By.ID, "productTitle"))).text
    print(product_title)

    # Open the full review listing from the product page footer.
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//a[@data-hook='see-all-reviews-link-foot']"))).click()

    while True:
        items = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "[data-hook='review']")))
        for item in items:
            # find_element(By..., ...) replaces the find_element_by_* helpers,
            # which newer Selenium releases no longer provide.
            reviewer = item.find_element(
                By.CSS_SELECTOR, "span.a-profile-name").text
            review = ' '.join(
                part.text.strip()
                for part in item.find_elements(
                    By.XPATH, ".//span[@data-hook='review-body']"))
            print(reviewer, review)

        try:
            wait.until(EC.presence_of_element_located(
                (By.XPATH, "//*[@data-hook='pagination-bar']//a[contains(@href,'/product-reviews/') and contains(text(),'Next page')]"))).click()
            # Wait for the last review of the old page to go stale so the next
            # iteration does not re-read the page that is being replaced.
            wait.until(EC.staleness_of(items[-1]))
        except TimeoutException:
            # No "Next page" link (or the page never changed): stop paging.
            break
finally:
    # Always close the browser, even if one of the waits above raised.
    driver.quit()
Solution
Store the values `product_title`, `review` and `reviewer` in a dictionary and convert it to JSON using the `json` module.
You may store the data in this format and finally convert the list to JSON.
lst = [{"product_title": <title>, "reviews": [{"review": <review>, "reviewer": <reviewer>}, {"review": <review>, "reviewer": <reviewer>}, ...]}]
import json
# Serialise the list to a JSON-formatted string. The result is returned, not
# stored: assign it to a variable or print it to make use of it.
json.dumps(lst)
To write the data to a JSON file
# Write the collected records to data.json. ensure_ascii=False keeps non-ASCII
# characters as-is instead of \uXXXX escapes, so the file is opened as UTF-8.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(lst, f, ensure_ascii=False)
Answered By – Ram
This answer, collected from Stack Overflow, is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.