Python: How to access a webpage, click specific links and copy the data within them to a text file?
I am quite new to Python and to programming, and I only know how to write simple scripts for routine office work. However, I have run into a scenario where I have to use Python to access a particular webpage, which is the search output of a particular bioinformatics web server.

In that webpage there is a table, and in its second column a hyperlink opens a little pop-up box with the FASTA file of the protein sequence.

I would like to be able to write a script that clicks these links systematically, one after the other, copies the FASTA sequence from each of the links, and pastes them into a text file.

Is this kind of automation possible with Python? If so, where do I start, in terms of which modules to use to access Internet Explorer/webpages etc.? If you could kindly guide me in the right direction or give me an example script, I will try it myself!

Thank you very much!

I would post what I have tried, but I have literally no idea where to start!
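(For orientation: the general pattern for this kind of scraping, sketched with the requests and BeautifulSoup libraries. The URL, table markup and output file name below are placeholders, and it assumes the links in the second column are absolute URLs to plain HTML pages, so treat it as a starting point rather than working code for this particular server.)

    import requests
    from bs4 import BeautifulSoup

    # Placeholder URL; the real results page and its markup will differ.
    page = requests.get('https://example.org/results')
    soup = BeautifulSoup(page.text, 'html.parser')

    with open('sequences.fasta', 'w') as out:
        # Walk each row of the results table and follow the link in the second column.
        for row in soup.find('table').find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2 or cells[1].a is None:
                continue  # header or malformed row
            link = cells[1].a.get('href')  # assumed to be an absolute URL
            detail = requests.get(link)
            detail_soup = BeautifulSoup(detail.text, 'html.parser')
            # Write the page text (the FASTA record) to the output file.
            out.write(detail_soup.get_text().strip() + '\n')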
this takes minute , half run me, after opens text file sequences. need of course add credentials, etc. @ end.
    import os
    import mechanize
    import cookielib
    from bs4 import BeautifulSoup
    from urlparse import urljoin


    class SequenceDownloader(object):

        def __init__(self, base_url, analyzes_page, email, password, result_path):
            self.base_url = base_url
            self.login_page = urljoin(self.base_url, 'login')
            self.analyzes_page = urljoin(self.base_url, analyzes_page)
            self.email = email
            self.password = password
            self.result_path = result_path
            self.browser = mechanize.Browser()
            self.browser.set_handle_robots(False)
            # set up a cookie jar so the login session persists across requests
            cj = cookielib.CookieJar()
            self.browser.set_cookiejar(cj)

        def login(self):
            self.browser.open(self.login_page)
            # select the first (and only) form on the page and log in
            self.browser.select_form(nr=0)
            self.browser.form['email'] = self.email
            self.browser.form['password'] = self.password
            self.browser.submit()

        def get_html(self, url):
            self.browser.open(url)
            return self.browser.response().read()

        def scrape_overview_page(self, html):
            sequences = []
            soup = BeautifulSoup(html)
            table = soup.find('table', {'class': 'styled data-table'})
            table_body = table.find('tbody')
            rows = table_body.find_all('tr', {'class': 'search_result'})
            for row in rows:
                cols = row.find_all('td')
                # the link in the second column leads to the page with the sequence
                sequence_url = cols[1].a.get('href')
                sequence_html = self.get_html(sequence_url)
                sequence_soup = BeautifulSoup(sequence_html)
                sequence = sequence_soup.find('pre').text
                sequences.append(sequence)
            return sequences

        def save(self, sequences):
            with open(self.result_path, 'w') as f:
                for sequence in sequences:
                    f.write(sequence + '\n')

        def get_sequences(self):
            self.login()
            overview_html = self.get_html(self.analyzes_page)
            sequences = self.scrape_overview_page(overview_html)
            self.save(sequences)


    if __name__ == '__main__':
        base_url = r'https://usgene.sequencebase.com'
        analyzes_page = 'user/reports/123/analyzes/9876'
        email = 'user1998510@gmail.com'
        password = 'yourpassword'
        result_path = r'C:\path\to\result.fasta'

        sd = SequenceDownloader(base_url, analyzes_page, email, password, result_path)
        sd.get_sequences()
        os.startfile(result_path)
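A note on the design: mechanize is used above because it fills in and submits the actual login form and, together with the cookie jar, keeps the session authenticated for the later requests. If the login form has no hidden fields such as a CSRF token (an assumption, not something the page confirms), roughly the same login step can be done with requests alone; the sketch below just reuses the URLs and form field names from the script above.

    import requests
    from bs4 import BeautifulSoup

    BASE_URL = 'https://usgene.sequencebase.com'

    # Log in by POSTing the form fields directly (assumes the form has no
    # hidden/CSRF fields; otherwise mechanize's form handling is the safer choice).
    session = requests.Session()
    session.post(BASE_URL + '/login',
                 data={'email': 'user1998510@gmail.com', 'password': 'yourpassword'})

    # The Session object keeps the login cookie, so later requests are authenticated.
    overview_html = session.get(BASE_URL + '/user/reports/123/analyzes/9876').text
    soup = BeautifulSoup(overview_html, 'html.parser')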