Selenium web scraping: trying to get a product list but always getting the same product

Issue

I am trying to scrape a product list from a website with Selenium. I prototyped the program and everything worked perfectly, but now that I have built a loop to get all the products, it just gives me the same product 484 times (that's the number of products on the website).

Here is my code:

from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client
import selenium
from selenium import webdriver

# URl to web scrape from
page_url = "https://www.smythstoys.com/at/de-at/spielzeug/lego/c/SM100114"

driver = webdriver.Chrome()
driver.get(page_url)

buttonName = "loadMoreProducts"
loadMoreButton = driver.find_element_by_id(buttonName)
# keep clicking "load more" until the button is no longer clickable
while loadMoreButton is not None:
    try:
        loadMoreButton.click()
    except (selenium.common.exceptions.ElementNotInteractableException,
            selenium.common.exceptions.ElementClickInterceptedException):
        break

# (this request and page_soup are never used below; all parsing is done via Selenium)
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# gets all products
containers = driver.find_elements_by_tag_name('article')
print(len(containers))

# name the output file to write to local disk
out_filename = "smythstoys_product_data.csv"
# header of csv file to be written
headers = "product_name;price; info \n"

# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)

# loops through all products
# -----------------------------------------------------------------------
# here is the problem:
for container in driver.find_elements_by_tag_name('article'):

    print("----------------------------------------------------------------------")
    product_name_container = container.find_element_by_xpath("//h2[@class ='prodName trackProduct']")
    product_name = product_name_container.text
    print(product_name)

    price_container = container.find_element_by_xpath("//div[@class ='price']")
    price = price_container.text
    print("price:", price)
# ------------------------------------------------------------------------------------
    try:
        info_container = container.find_element_by_xpath("//span[@class ='decalImage-right']").text
        print(info_container)
        if not info_container:
            info = "no special type"
            print(info)
            print(info_container)
            f.write(product_name + "; " + price + "; " + info + "\n")
            continue
        if info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hed/h5f/8823589830686" \
                             "/lego-hard-to-find-decal_CE.svg":
            info = "seltenes Set"
        elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/h41/h70" \
                               "/8823587930142/new-decal_CE%20%281%29.svg":
            info = "neues Set"
        elif info_container == "https://smyths-at-prod-images.storage.googleapis.com/sys-master/images/hde/hae" \
                               "/8871381303326/sale-decal_CE.svg":
            info = "Sale"
        else:
            info = "unknown type" + info_container
        print(info)
        print(info_container)
    except NameError:
        print("no atribute")
        if info_container is None:
            info = "unknown type"
            print(info)

    # writes the dataset to file
    f.write(product_name + "; " + price + "; " + info + "\n")

f.close()  # Close the file

My output is:

LEGO Star Wars 75244 Tantive IV
price: 199,99€

no special type

and that repeats 484 times.

Solution

I'm not sure why you used Selenium to get the products when requests can do it smoothly. First, though, here is why your loop prints the same product every time.
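An XPath expression that starts with // always searches the whole document, even when find_element_by_xpath is called on an element, so container.find_element_by_xpath("//h2[@class ='prodName trackProduct']") returns the first matching h2 on the page for every container. Prefixing the expression with a dot makes it relative to the element. A minimal sketch of that fix, keeping the Selenium 3 API from the question:

# ".//" searches inside `container`; "//" would search the whole page
for container in driver.find_elements_by_tag_name('article'):
    product_name = container.find_element_by_xpath(".//h2[@class='prodName trackProduct']").text
    price = container.find_element_by_xpath(".//div[@class='price']").text
    print(product_name, price)

That said, requests can fetch the same data without a browser at all. The following gets all the products by calling the site's load-more endpoint directly: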

import requests
from bs4 import BeautifulSoup

link = "https://www.smythstoys.com/at/de-at/spielzeug/lego/c/SM100114/load-more?"

params = {'q': ':bestsellerRating:productVisible:true', 'page': '1'}

p = 0

while True:
    params['page'] = p
    r = requests.get(link, params=params, headers={
        'content-type': 'application/json; charset=utf-8'
    })
    # the endpoint answers with JSON; the rendered product markup sits in 'htmlContent'
    soup = BeautifulSoup(r.json()['htmlContent'], "lxml")
    # stop once a page comes back without any product links
    if not soup.select_one("a.trackProduct[href]"):
        break
    for item in soup.select("a.trackProduct[href]"):
        product_name = item.select_one("h2.prodName").get_text(strip=True)
        product_price = item.select_one("[itemprop='price']").get("content")
        print(product_name, product_price)

    p += 1
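
The endpoint returns JSON whose htmlContent field carries the next batch of product markup, and the loop raises page until a batch comes back with no product links. If you still want the semicolon-separated file your original script produced, you can collect rows while paginating and write them afterwards with Python's csv module. A minimal sketch (the delimiter and column names mirror your headers line; the rest is illustrative):

import csv

rows = []
# run the while-loop above, but instead of print(...) do:
# rows.append((product_name, product_price))

with open("smythstoys_product_data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=";")
    writer.writerow(["product_name", "price"])
    writer.writerows(rows)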

Answered By – SIM

This answer was collected from Stack Overflow and is licensed under CC BY-SA 2.5, CC BY-SA 3.0, and CC BY-SA 4.0.
