"""Download a PLOS ONE article page and append its visible text to a file.

Run as a script, this fetches ``ARTICLE_URL``, drops the ``<script>`` and
``<style>`` elements, collapses the remaining text to one non-blank chunk per
line, and appends the result to ``OUTPUT_PATH`` encoded as UTF-8.
"""

import io
import re  # not used by this script; kept from the original import list
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

ARTICLE_URL = (
    "http://journals.plos.org/plosone/article"
    "?id=info%3Adoi/10.1371/journal.pone.0162069"
)

# The spelling "ouput" is the original file name; it is kept so that repeated
# runs keep appending to the same file as before.
OUTPUT_PATH = "ouput.txt"


def fetch_page(url):
    """Return the raw body served at *url*.

    The response is closed even if reading fails.
    """
    with closing(urlopen(url)) as response:
        return response.read()


def html_to_text(page):
    """Return the text of the HTML document *page* without scripts and styles.

    *page* is handed to BeautifulSoup unchanged, using the ``lxml`` parser.
    """
    # Imported here rather than at module level so that the pure-text helpers
    # in this module stay importable when BeautifulSoup is not installed.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(page, "lxml")
    # Script and style bodies are code, not article text: remove them from
    # the tree before extracting text.
    for element in soup(["script", "style"]):
        element.extract()
    return soup.get_text()


def normalize_text(raw):
    """Collapse *raw* into one stripped, non-blank chunk per line.

    Each line is stripped, then split on runs of two spaces so that several
    headlines sharing one line each get their own line; empty chunks are
    dropped. Single spaces inside a chunk are preserved.
    """
    lines = (line.strip() for line in raw.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return u"\n".join(chunk for chunk in chunks if chunk)


def append_text(text, path):
    """Append *text* to the file at *path*, encoded as UTF-8.

    The file is created if it does not exist. No trailing newline is added.
    """
    # io.open encodes on write, so *text* stays text on Python 2 and 3 alike
    # (writing pre-encoded bytes to a text-mode file fails on Python 3).
    with io.open(path, "a", encoding="utf-8") as output:
        output.write(text)


def main():
    """Fetch the article, extract its text and append it to the output file."""
    page = fetch_page(ARTICLE_URL)
    text = normalize_text(html_to_text(page))
    append_text(text, OUTPUT_PATH)


if __name__ == "__main__":
    main()