import pandas as pd
from bs4 import BeautifulSoup
def convert_to_xlsx(src='sample.xls', dst='sample.xlsx'):
    """Convert an XML spreadsheet file into an ``.xlsx`` workbook.

    The input is parsed as XML.  Every ``Worksheet`` element becomes one
    sheet in the output workbook, every ``Row`` element one spreadsheet
    row, and every ``Cell`` element one cell holding the text of its
    ``Data`` child (or an empty string when the cell has no ``Data``).

    Args:
        src: Path of the XML spreadsheet to read.  Defaults to
            ``'sample.xls'``.
        dst: Path of the ``.xlsx`` file to write.  Defaults to
            ``'sample.xlsx'``.

    Raises:
        KeyError: If a ``Worksheet`` element has no ``ss:Name`` attribute.
        OSError: If ``src`` cannot be opened.
    """
    # Read as bytes and let the parser pick the encoding instead of
    # decoding with the platform default encoding.
    with open(src, 'rb') as xml_file:
        soup = BeautifulSoup(xml_file.read(), 'xml')

    # ``ExcelWriter.save()`` no longer exists in pandas 2.x; the context
    # manager writes and closes the workbook, also when an error occurs.
    # NOTE(review): with zero Worksheet elements nothing is written to the
    # writer, and the Excel engine may refuse to save an empty workbook.
    with pd.ExcelWriter(dst) as writer:
        for sheet in soup.find_all('Worksheet'):
            rows = [
                [cell.Data.text if cell.Data else '' for cell in row.find_all('Cell')]
                for row in sheet.find_all('Row')
            ]
            # Written without index/header so the sheet contains only the
            # cell values taken from the source document.
            pd.DataFrame(rows).to_excel(
                writer,
                sheet_name=sheet.attrs['ss:Name'],
                index=False,
                header=False,
            )
from bs4 import BeautifulSoup
import re
# Sample markup: only some of the <nobr> elements carry a size in MB.
html_doc = """
<nobr><br/><a href="down.php?id=555"><b>2,55 tlm</b></a></nobr>
<nobr>1895.35 MB<br/><a href="down.php?id=555"><b>2,55 tlm</b></a></nobr>
<bobr>1895.35 MB<br/><a href="down.php?id=555"><b>2,55 tlm</b></a></bobr>
<nobr><br/><a href="down.php?id=555"><b>2,56 tlm</b></a></nobr>
<nobr>1896.36 GB<br/><a href="down.php?id=555"><b>2,57 tlm</b></a></nobr>
<nobr>1896.36 MB<br/><a href="down.php?id=556"><b>2,58 tlm</b></a></nobr>
<br>1896.36 MB<br/><a href="down.php?id=556"><b>2,58 tlm</b></a></br>
"""
tbl = BeautifulSoup(html_doc, 'lxml')

# Raw string so ``\d`` / ``\s`` reach the regex engine untouched, and an
# escaped dot so only a literal decimal point is accepted (a bare ``.``
# would match any character).
tmpt = re.compile(r"\d*\.\d\d\s+MB")

# Print the first "<number> MB" occurrence found in the markup of each
# <nobr> element; elements without such a size are skipped.
for nobr in tbl.find_all('nobr'):
    result = tmpt.search(str(nobr))
    if result:
        print(result.group(0))
from bs4 import BeautifulSoup
import re
# Sample markup: only some of the <nobr> elements carry a size in MB.
html_doc = """
<nobr><br/><a href="down.php?id=555"><b>2,55 tlm</b></a></nobr>
<nobr>1895.35 MB<br/><a href="down.php?id=555"><b>2,55 tlm</b></a></nobr>
<nobr><br/><a href="down.php?id=555"><b>2,56 tlm</b></a></nobr>
<nobr>1896.36 GB<br/><a href="down.php?id=555"><b>2,57 tlm</b></a></nobr>
<nobr>1896.36 MB<br/><a href="down.php?id=556"><b>2,58 tlm</b></a></nobr>
"""
tbl = BeautifulSoup(html_doc, 'lxml')

# Search the text nodes directly instead of iterating over tags.
# ``string=`` is the current name of the argument formerly called ``text``.
# The pattern is a raw string with an escaped dot so that only a literal
# decimal point matches.
# Note: despite its name, ``nobr`` is bound to each matching text node here,
# not to a <nobr> tag.
for nobr in tbl.find_all(string=re.compile(r"\d*\.\d\d\s+MB")):
    print(nobr)
# See also: https://pypi.org/project/ulif.openoffice/0.4/