Making Subsequent POST Request In Session Doesn't Work - Web Scraping
Here's what I'm trying to do: go here, then hit 'search'. Grab the data, then hit 'next', and keep hitting next until you're out of pages. Everything up to hitting 'next' works. He
Solution 1:
Well, this nearly drove me mental, but it is finally working: you have to make a GET request to fetch a new __EVENTVALIDATION
token before each POST:
import requests
from bs4 import BeautifulSoup
# Headers sent with the requests: they mark the POSTs as XMLHttpRequest
# (AJAX) postbacks and present a desktop browser user agent.
h = {
    "X-MicrosoftAjax": "Delta = true",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
}

# Reference value kept from the original snippet, where it was a bare string
# expression with no effect:
# "ctl00$ctl13 | ctl00$MainContent$physicianSearchView$btnSearch"

# Form data for the initial "search" POST.  The __EVENTVALIDATION and
# __VIEWSTATE tokens are filled in at run time from a fresh GET.
d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$btnSearch",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
    'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
    'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
    '__ASYNCPOST': 'true',
}

# Form data for the follow-up POST that moves from page 1 to page 2 via the
# pager drop-down.  Tokens are filled in at run time as well.
nxt_d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",
    "__ASYNCPOST": "true",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
}

url = "http://search.cpsa.ca/PhysicianSearch"


def _copy_tokens(response, form):
    """Copy the page's __EVENTVALIDATION and __VIEWSTATE values into ``form``.

    ``response`` is the result of a GET on the search page; ``form`` is the
    dict that will be sent as the body of the next POST.  Raises IndexError
    if either hidden field is missing from the page.
    """
    soup = BeautifulSoup(response.content, "lxml")
    form["__EVENTVALIDATION"] = soup.select("#__EVENTVALIDATION")[0]["value"]
    form["__VIEWSTATE"] = soup.select("#__VIEWSTATE")[0]["value"]


with requests.session() as s:
    # Initial GET supplies the tokens for the search POST.
    _copy_tokens(s.get(url, headers=h), d)
    r = s.post(url, data=d, headers=h)

    # A new GET is required before every POST so that the tokens are fresh;
    # reusing the earlier ones is what made the second POST fail.
    _copy_tokens(s.get(url), nxt_d)
    # After this call ``r`` holds the response for page 2 of the results.
    r = s.post(url, data=nxt_d, headers=h)
If you open the source from the last post, you will see that you hit page 2. More logic is needed to get through all the pages; I will add it in a bit.
The params:
"ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
"ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1"
are the page to go to and the page you are coming from, respectively, so after each GET those two values should be all that needs to change.
This will get all the pages, pulling most of the values programmatically. You could probably pull more, especially with the aid of a regex, but it pulls most of them without hard-coding values:
from lxml.html import fromstring
import requests
class Crawler(object):
    """Iterate over every results page of the physician-search form.

    The crawler performs one search POST and then keeps POSTing the pager
    control until the "next page" button is reported as disabled.  Before
    each POST it re-fetches the page with a GET to obtain fresh
    ``__EVENTVALIDATION`` and ``__VIEWSTATE`` tokens.

    Relies on ``requests`` and ``lxml.html.fromstring`` being imported by
    the enclosing module.
    """

    # XPath yielding the ``disabled`` attribute of the next-page button; a
    # non-empty result is treated as "this is the last page".
    _NEXT_DISABLED_XPATH = "//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled"
    # XPath yielding the form-field names of the pager drop-downs.
    _PAGER_NAMES_XPATH = "//*[@id='MainContent_physicianSearchView_gvResults_ddlPager']/@name"

    def __init__(self, ua, url):
        """Store the user agent and target URL and seed the POST payloads.

        ua  -- user-agent string sent with the requests.
        url -- address of the search page; used for every GET and POST.
        """
        self.user_agent = ua
        # Headers for the AJAX-style POSTs.
        self.post_header = {"X-MicrosoftAjax": "Delta = true", "X-Requested-With": "XMLHttpRequest", "user-agent": ua}
        # Payload for all paging POSTs; completed by populate2/put_validation.
        self.post_data2 = {'__ASYNCPOST': 'true',
                           "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}
        self.url = url
        # Payload for the initial search POST; completed by populate/put_validation.
        self.post_data1 = {'__ASYNCPOST': 'true'}

    def populate(self, xml):
        """Pulls form post data keys and values for initial post."""
        k1 = xml.xpath("//*[@id='hfPrefetchUrl']")[0]
        k2 = xml.xpath("//*[@id='hfRemoveUrl']")[0]
        self.post_data1[k1.get("name")] = k1.get("value")
        self.post_data1[k2.get("name")] = k2.get("value")
        # The search button's field name doubles as the event target.
        self.post_data1["ctl00$ctl13"] = xml.xpath("//input[@value='Search']/@name")[0]
        self.post_data1["__EVENTTARGET"] = self.post_data1["ctl00$ctl13"]

    def populate2(self, xml):
        """Pulls form post data keys and values,
        for all subsequent posts,
        setting initial page number values.

        Raises IndexError when the results page does not expose two pager
        controls.
        """
        data = xml.xpath(self._PAGER_NAMES_XPATH)
        if len(data) < 2:
            raise IndexError(
                "expected two pager controls in the results page, found %d" % len(data))
        # First pager name carries the page to go to, the second the page
        # we are coming from (and is also the event target).
        self.pge = data[0]
        self.ev = data[1]
        self.post_data2["__EVENTTARGET"] = self.ev
        self.post_data2[self.ev] = "1"
        self.post_data2[self.pge] = "2"

    @staticmethod
    def put_validation(xml, d):
        """Need to request new __EVENTVALIDATION for each post.

        Copies the ``__EVENTVALIDATION`` and ``__VIEWSTATE`` values found in
        the parsed page ``xml`` into the payload dict ``d``.
        """
        ev = xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0]
        vs = xml.xpath("//*[@id='__VIEWSTATE']/@value")[0]
        d["__EVENTVALIDATION"] = ev
        d["__VIEWSTATE"] = vs

    def next_page(self, d=None):
        """Increments the page number by one per iteration.

        ``d`` is accepted only for backward compatibility and is ignored;
        the counters always live in ``self.post_data2``.
        """
        e = self.post_data2[self.ev]
        v = self.post_data2[self.pge]
        self.post_data2[self.pge] = str(int(v) + 1)
        self.post_data2[self.ev] = str(int(e) + 1)

    def start(self):
        """Yield the lxml-parsed response of every results page, in order."""
        get_header = {"user-agent": self.user_agent}
        with requests.session() as s:
            # get initial page to pull __EVENTVALIDATION etc..
            req = s.get(self.url, headers=get_header).content
            # parse once and reuse the tree for both tokens and form fields.
            xml = fromstring(req)
            # add __EVENTVALIDATION" to post data.
            self.put_validation(xml, self.post_data1)
            # populate the rest of the post data.
            self.populate(xml)
            resp = fromstring(s.post(self.url, data=self.post_data1, headers=self.post_header).content)
            # yield first page results.
            yield resp
            # a non-empty list means the next button is disabled, i.e. we
            # are on the last page.  Check the *results* page, not the
            # initial GET, otherwise the state can never change.
            nxt = resp.xpath(self._NEXT_DISABLED_XPATH)
            if nxt:
                return
            # fill post data for next pages.
            self.populate2(resp)
            while not nxt:
                # update __EVENTVALIDATION token and _VIEWSTATE.
                self.put_validation(fromstring(s.get(self.url, headers=get_header).content), self.post_data2)
                # post to get next page of results.
                resp = fromstring(s.post(self.url, data=self.post_data2, headers=self.post_header).content)
                yield resp
                nxt = resp.xpath(self._NEXT_DISABLED_XPATH)
                self.next_page(self.post_data2)
# Example driver: crawl every results page of the physician search.
ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
url = "http://search.cpsa.ca/PhysicianSearch"
c = Crawler(ua, url)
for tree in c.start():
    # use tree: each item is the lxml-parsed response for one results page.
    # Placeholder statement so the loop has a body and the snippet parses.
    pass
Post a Comment for "Making Subsequent POST Request In Session Doesn't Work - Web Scraping"