First, specify the URL of the page, then grab the HTML from the page using the Python requests library. You can then parse the output using Beautiful Soup.
# Fetch the Parliament "open calls for evidence" page and locate the
# main content container that holds all four inquiry sections.
url = "https://www.parliament.uk/business/committees/inquiries-a-z/current-open-calls-for-evidence/#jump-link-0"
# Timeout prevents the script hanging forever on a stalled connection;
# raise_for_status fails fast instead of parsing an HTTP error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
page = r.text
soup = bs(page, 'lxml')
# The ASP.NET wrapper div that contains the inquiry listings.
main = soup.find(
    'div',
    attrs={"id": "ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_ctlMainBody_wrapperDiv"},
)
Once you’ve done that, it’s all about extracting the correct information:
inquiry_list = []


def _collect_open_inquiries(section_id, inquiry_type):
    """Collect the open inquiries listed under the <h2> with ``section_id``.

    Walks the siblings following the section heading until the next <h2>,
    pulling each <li> out of every <ul> encountered. Each <li> is expected
    to contain two links (inquiry page, committee page) plus a direct text
    child of the form " | Deadline <date>" -- assumption based on the
    original parsing; confirm against the live markup.

    Returns a list of dicts with keys: type, inquiry name, committee,
    deadline, status, link (same schema and key order as before).
    """
    items = []
    for tag in main.find("h2", attrs={"id": section_id}).next_siblings:
        if tag.name == "h2":
            break  # reached the next section heading; stop scanning
        if tag.name != "ul":
            continue
        for entry in tag.findAll('li'):
            links = entry.findAll('a')  # look up once, not once per field
            # Direct (non-recursive) text children only: index 1 holds the
            # " | Deadline ..." fragment; strip the boilerplate around the date.
            deadline = (entry.findAll(text=True, recursive=False)[1]
                        .replace(' | ', '').replace('Deadline ', ''))
            items.append({
                'type': inquiry_type,
                'inquiry name': links[0].text,
                'committee': links[1].text,
                'deadline': deadline,
                'status': 'Open',
                'link': links[0]["href"],
            })
    return items


# Commons, Lords and Joint sections share identical markup, so the same
# helper handles all three; only the anchor id and the type label differ.
inquiry_list.extend(_collect_open_inquiries("jump-link-0", 'Commons'))
inquiry_list.extend(_collect_open_inquiries("jump-link-1", 'Lords'))
inquiry_list.extend(_collect_open_inquiries("jump-link-2", 'Joint'))
# Public inquiries: same sibling walk as the parliamentary sections, but
# each <li> carries a single link, no committee, and the deadline sits in
# the first direct text child rather than the second.
for sibling in main.find("h2", attrs={"id": "jump-link-3"}).next_siblings:
    if sibling.name == "h2":
        break  # next section heading reached; stop scanning
    if sibling.name != "ul":
        continue
    for item in sibling.findAll('li'):
        link_tag = item.findAll('a')[0]
        due = item.findAll(text=True, recursive=False)[0]
        due = due.replace(' | ', '').replace('Deadline ', '')
        inquiry_list.append({
            'type': 'Public',
            'inquiry name': link_tag.text,
            'deadline': due,
            'status': 'Open',
            'link': link_tag["href"],
        })
Then you can write it all to a JSON file if you want:
# Persist the scraped inquiries to a UTF-8 JSON file.
# (The original also assigned an unused duplicate name `inname`; removed.)
jsonfile = 'inquiries_list.json'
with io.open(jsonfile, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps accented names readable in the output file.
    json.dump(inquiry_list, f, ensure_ascii=False)