Scrape ajax pages

Issue

I do not know how to scrape AJAX pages. There is no pagination on the website; more results are loaded only by clicking the "Load more" button. This is the page link: https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false

import scrapy
from scrapy.http import Request
from selenium import webdriver
from scrapy_selenium import SeleniumRequest
import pandas  as pd

class TestSpider(scrapy.Spider):
    """Scrape exhibitor details starting from the exhibitor gallery page.

    The gallery is requested through ``SeleniumRequest``; every exhibitor
    link found on it is then followed with a plain ``Request`` and parsed
    by ``parse_book``.
    """

    name = 'test'

    def start_requests(self):
        """Yield the single ``SeleniumRequest`` for the exhibitor gallery."""
        yield SeleniumRequest(
            url="https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        """Follow each exhibitor link found in the gallery cards.

        Yields one ``Request`` per extracted ``href``, resolved against the
        response URL and handled by ``parse_book``.
        """
        books = response.xpath("//h3[@class='card-Title\nbreak-word\nf3\nmb1\nmt0']//a//@href").extract()

        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    @staticmethod
    def _strip_or_none(value):
        """Return ``value`` stripped of surrounding whitespace, or ``None``.

        Selector ``.get()`` returns ``None`` when nothing matches; calling
        ``.strip()`` on that would raise ``AttributeError`` and drop the
        whole item, so missing fields are passed through as ``None``.
        """
        if value is None:
            return None
        return value.strip()

    def parse_book(self, response):
        """Extract title, address, website and phone from an exhibitor page.

        Yields a dict with the keys ``title``, ``address``, ``website`` and
        ``phone``. A field whose selector matches nothing is ``None``
        instead of aborting the item.
        """
        title = response.css(".mr3-m::text").get()

        address = self._strip_or_none(
            response.css(".showcase-address::text").get()
        )

        website = self._strip_or_none(
            response.xpath("//li[@class='dib  ml3  mr3']//a[starts-with(@href, 'http')]/@href").get()
        )

        phone = self._strip_or_none(
            response.xpath("//li[@class='dib  ml3  mr3'] //span[contains(text(), 'Phone:')]/following-sibling::text()").get()
        )
        if phone is not None:
            # Drop the dash separators so only the bare number remains.
            phone = phone.replace("-", "")

        yield {
            'title': title,
            'address': address,
            'website': website,
            'phone': phone,
        }
    
    

Solution

Okay, try the following script. It traverses the entire exhibitor list and grabs all the fields you want:

import scrapy
from scrapy.selector import Selector

class MapYourShowSpider(scrapy.Spider):
    """Scrape exhibitor details through the site's JSON endpoints.

    ``start_requests`` queries ``content_url`` with ``params``; ``parse``
    reads the exhibitor hits from the JSON reply and requests each
    exhibitor's detail URL; ``parse_content`` parses the HTML fragment
    embedded in that reply's JSON.
    """

    name = "mapyourshow"

    # Endpoint queried for the exhibitor search results.
    content_url = 'https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm'
    # Detail-page URL template; ``{}`` is filled with the exhibitor id.
    inner_base = 'https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={}'

    # Sent with every request issued by this spider.
    headers = {
        'x-requested-with': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }
    # Query parameters for the search request.
    # NOTE(review): 'searchsize' presumably caps the number of returned
    # exhibitors and is hard-coded to 557 -- confirm and adjust if the
    # exhibitor list grows.
    params = {
        'action': 'search',
        'searchtype': 'exhibitorgallery',
        'searchsize': '557',
        'start': '0',
    }

    def start_requests(self):
        """Yield the GET request for the exhibitor search endpoint."""
        yield scrapy.FormRequest(
            url=self.content_url,
            method='GET',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )

    def parse(self, response):
        """Yield one detail request per exhibitor hit in the JSON reply."""
        hits = response.json()['DATA']['results']['exhibitor']['hit']
        for item in hits:
            inner_link = self.inner_base.format(item['fields']['exhid_l'])
            yield scrapy.Request(
                url=inner_link,
                headers=self.headers,
                callback=self.parse_content,
            )

    def parse_content(self, response):
        """Extract title, address, website and phone from a detail reply.

        The reply is JSON whose ``DATA.BODYHTML`` value is an HTML fragment;
        that fragment is parsed with ``Selector``. Yields a dict with the
        keys ``title``, ``address``, ``website`` and ``phone``.
        """
        elem = response.json()['DATA']['BODYHTML']
        sel = Selector(text=elem)
        title = sel.css("h2::text").get()
        # ``getall()`` always returns a list of strings, so no exception
        # handling is needed: with no address nodes the join yields "".
        # Whitespace runs inside each text node are collapsed to one space.
        address = ' '.join(
            ' '.join(part.split())
            for part in sel.css("p.showcase-address::text").getall()
        )
        website = sel.css("a[title*='website']::text").get()
        phone = sel.xpath("normalize-space(//*[starts-with(@class,'showcase-web-phone')]/li[./*[.='Phone:']]/span/following::text())").get()
        yield {"title": title, "address": address, "website": website, "phone": phone}

Answered By – SIM

This answer, collected from Stack Overflow, is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Leave a Reply

(*) Required, Your email will not be published