Scraping Open Parliamentary Inquiries

First, specify the URL of the page and grab its HTML using the Python requests library. You can then parse the result with Beautiful Soup.

# Base URL of the Parliament "current open calls for evidence" page.
# The "#jump-link-0" fragment is only meaningful client-side; the server ignores it.
url = "https://www.parliament.uk/business/committees/inquiries-a-z/current-open-calls-for-evidence/#jump-link-0"

# Fetch the page. A timeout stops the script hanging indefinitely on a dead
# connection, and raise_for_status() fails fast on HTTP errors (404/500)
# instead of silently parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
page = r.text
soup = bs(page, 'lxml')

# Main content wrapper that contains every inquiry section on the page.
main = soup.find('div', attrs={"id": "ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_ctlMainBody_wrapperDiv"})

Once you’ve done that, it’s all about extracting the correct information:

inquiry_list = []

def _extract_section(container, jump_id, section_type, has_committee=True):
	"""Collect the open inquiries listed under one <h2> section of the page.

	container     -- the parsed wrapper <div> holding all sections
	jump_id       -- id of the section's <h2> anchor (e.g. "jump-link-0")
	section_type  -- label stored under the 'type' key of each result
	has_committee -- Commons/Lords/Joint items carry a second <a> naming the
	                 committee and keep the deadline in the *second* bare text
	                 node of the <li>; public consultations have no committee
	                 link and the deadline is the *first* bare text node.

	Returns a list of dicts, one per inquiry.
	"""
	items = []
	heading = container.find("h2", attrs={"id": jump_id})
	for tag in heading.next_siblings:
		# A following <h2> marks the start of the next section: stop there.
		if tag.name == "h2":
			break
		if tag.name != "ul":
			continue
		for entry in tag.findAll('li'):
			links = entry.findAll('a')
			# Bare text nodes of the <li> hold fragments like " | Deadline 1 June".
			texts = entry.findAll(text=True, recursive=False)
			raw_deadline = texts[1] if has_committee else texts[0]
			item = {}
			item['type'] = section_type
			item['inquiry name'] = links[0].text
			if has_committee:
				item['committee'] = links[1].text
			item['deadline'] = raw_deadline.replace(' | ', '').replace('Deadline ', '')
			item['status'] = 'Open'
			item['link'] = links[0]["href"]
			items.append(item)
	return items

# Commons, Lords and Joint committee sections share one layout; the public
# consultations section (jump-link-3) lacks the committee link.
inquiry_list.extend(_extract_section(main, "jump-link-0", 'Commons'))
inquiry_list.extend(_extract_section(main, "jump-link-1", 'Lords'))
inquiry_list.extend(_extract_section(main, "jump-link-2", 'Joint'))
inquiry_list.extend(_extract_section(main, "jump-link-3", 'Public', has_committee=False))

Then you can write it all to a JSON file if you want:

# Output filename for the scraped inquiries.
jsonfile = 'inquiries_list.json'

# Write the list of inquiry dicts to a UTF-8 JSON file.  ensure_ascii=False
# keeps non-ASCII characters (e.g. curly quotes in inquiry titles) readable
# instead of escaping them to \uXXXX sequences.
with open(jsonfile, 'w', encoding='utf-8') as f:
	json.dump(inquiry_list, f, ensure_ascii=False)