from bs4 import BeautifulSoup as bs

# Extract the visible text of an HTML file and print it as a single line.
# NOTE(review): `filename` is not defined in this snippet; it is presumably
# assigned elsewhere before this code runs -- confirm.

# Read through a context manager so the file handle is closed promptly
# instead of being left open until garbage collection.
with open(filename) as source_file:
    markup = source_file.read()

# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and warns), so output can differ between
# machines. "html.parser" ships with the standard library.
soup = bs(markup, "html.parser")

# Remove <style> and <script> elements so their contents do not end up in
# the extracted text. A plain loop is used because only the side effect
# matters; no result list is needed.
for tag in soup(['style', 'script']):
    tag.extract()

tmpText = soup.get_text()

# Delete every tab and newline, then encode to UTF-8 and trim surrounding
# whitespace from the encoded result.
text = tmpText.replace('\t', '').replace('\n', '').encode('utf-8').strip()

# A parenthesised single-argument print is valid on both Python 2 and 3.
# NOTE(review): on Python 3 `text` is a bytes object, so this prints its
# repr (b'...'); drop the .encode('utf-8') step if the script is ported.
print(text)