Issue
I am trying to get a Product List of a website with selenium. I prototyped the program and everything worked perfectly but now I built a loop to get all products and it just gives me the same product 484 times(that’s the number of products there are on the website)
Here is my code:
from bs4 import BeautifulSoup as soup # HTML data structure
from urllib.request import urlopen as uReq # Web client
import selenium
from selenium import webdriver
# URl to web scrape from
page_url = "https://www.smythstoys.com/at/de-at/spielzeug/lego/c/SM100114"
driver = webdriver.Chrome()
driver.get(page_url)
buttonName = "loadMoreProducts"
loadMoreButton = driver.find_element_by_id(buttonName)
while loadMoreButton is not None:
try:
try:
loadMoreButton.click()
except selenium.common.exceptions.ElementNotInteractableException:
break
except selenium.common.exceptions.ElementClickInterceptedException:
break
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
# gets all products
containers = driver.find_elements_by_tag_name('article')
print(len(containers))
# name the output file to write to local disk
out_filename = "smythstoys_product_data.csv"
# header of csv file to be written
headers = "product_name;price; info \n"
# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)
# loops trough all products
# -----------------------------------------------------------------------
# here is the problem:
for container in driver.find_elements_by_tag_name('article'):
print("----------------------------------------------------------------------")
product_name_container = container.find_element_by_xpath("//h2[@class ='prodName trackProduct']")
product_name = product_name_container.text
print(product_name)
price_container = container.find_element_by_xpath("//div[@class ='price']")
price = price_container.text
print("price:", price)
# ------------------------------------------------------------------------------------
try:
info_container = container.find_element_by_xpath("//span[@class ='decalImage-right']").text
print(info_container)
if not info_container:
info = "no special type"
print(info)
print(info_container)
f.write(product_name + "; " + price + "; " + info + "\n")
continue
if info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hed/h5f/8823589830686" \
"/lego-hard-to-find-decal_CE.svg":
info = "seltenes Set"
elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/h41/h70" \
"/8823587930142/new-decal_CE%20%281%29.svg":
info = "neues Set"
elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hde/hae" \
"/8871381303326/sale-decal_CE.svg":
info = "Sale"
else:
info = "unknown type" + info_container
print(info)
print(info_container)
except NameError:
print("no atribute")
if info_container is None:
info = "unknown type"
print(info)
# writes the dataset to file
f.write(product_name + "; " + price + "; " + info + "\n")
f.close() # Close the file
My output is:
LEGO Star Wars 75244 Tantive IV
price: 199,99€
no special type
and that 484x
Solution
I’m not sure why you used selenium to get the products when requests can do it smoothly. The following is something you wanna do to get all the products using requests.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
link = "https://www.smythstoys.com/at/de-at/at/de-at/spielzeug/lego/c/SM100114/load-more?"
params = {'q':':bestsellerRating:productVisible:true','page':'1'}
p = 0
while True:
params['page'] = p
r = requests.get(link,params=params,headers={
'content-type': 'application/json; charset=utf-8'
})
soup = BeautifulSoup(r.json()['htmlContent'],"lxml")
if not soup.select_one("a.trackProduct[href]"):break
for item in soup.select("a.trackProduct[href]"):
product_name = item.select_one("h2.prodName").get_text(strip=True)
product_price = item.select_one("[itemprop='price']").get("content")
print(product_name,product_price)
p+=1
Answered By – SIM
This Answer collected from stackoverflow, is licensed under cc by-sa 2.5 , cc by-sa 3.0 and cc by-sa 4.0