from bs4 import BeautifulSoup

content_table = """
<table>
  <thead>
    <th>ID</th>
    <th>Vendor</th>
    <th>Product</th>
  </thead>
  <tr>
    <td>1</td>
    <td>Intel</td>
    <td>Processor</td>
  </tr>
  <tr>
    <td>2</td>
    <td>AMD</td>
    <td>GPU</td>
  </tr>
  <tr>
    <td>3</td>
    <td>Gigabyte</td>
    <td>Mainboard</td>
  </tr>
</table>
"""

soup = BeautifulSoup(content_table, 'html.parser')

# Map each column index to its header name, normalized to lowercase.
headers = {}
thead = soup.find("thead").find_all("th")
for i in range(len(thead)):
    headers[i] = thead[i].text.strip().lower()

# Every <tr> holds one data row; build a dict per row,
# keyed by the header names collected above.
data = []
rows = soup.find_all("tr")
for row in rows:
    cells = row.find_all("td")
    item = {}
    for index in headers:
        item[headers[index]] = cells[index].text
    data.append(item)
print(data)
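Run as-is, this prints one dict per data row, with the lowercased headers as keys:

[{'id': '1', 'vendor': 'Intel', 'product': 'Processor'}, {'id': '2', 'vendor': 'AMD', 'product': 'GPU'}, {'id': '3', 'vendor': 'Gigabyte', 'product': 'Mainboard'}]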
from pprint import pprint

from pywebhdfs.webhdfs import PyWebHdfsClient

# Your NameNode host, WebHDFS port and HDFS username go here.
hdfs = PyWebHdfsClient(host='hadoop01', port='50070', user_name='hadoop')
my_dir = '/examples/Reutov_mos_obl.csv'
pprint(hdfs.list_dir(my_dir))
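list_dir only shows what is already in HDFS; to put the CSV there in the first place, a minimal sketch with the same client could look like this (the local file name is an assumption):

# Hypothetical upload step: read the local CSV and write it to HDFS.
hdfs.make_dir('/examples')
with open('Reutov_mos_obl.csv', 'rb') as f:
    hdfs.create_file('/examples/Reutov_mos_obl.csv', f.read())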
import json

# `cursor`, `col_names`, `json_file`, `count_rows` and `myconverter`
# are assumed to be set up earlier: a DB cursor, the column names
# taken from cursor.description, an open output file, a row counter
# and a fallback converter for json.dumps.
for row in cursor.fetchall():
    count_rows += 1
    result_inside = {}
    row_content = []
    for col, val in zip(col_names, row):
        # Check whether the value has a read() attribute: if it is
        # a CLOB, read() turns it into a plain string.
        if hasattr(val, 'read'):
            result_inside[col] = val.read()
        else:
            result_inside[col] = val
        row_content.append(result_inside[col])
    json_file.write(json.dumps(result_inside, default=myconverter))
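The same export can skip fetchall(), which materializes the whole result set in memory, and iterate the cursor directly so rows are streamed from the database: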
for row in cursor:
    count_rows += 1
    result_inside = {}
    row_content = []
    for col, val in zip(col_names, row):
        # Same CLOB check as above: read() turns a LOB into a string.
        if hasattr(val, 'read'):
            result_inside[col] = val.read()
        else:
            result_inside[col] = val
        row_content.append(result_inside[col])
    json_file.write(json.dumps(result_inside, default=myconverter))
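The definition of myconverter is not shown above; a minimal sketch, assuming it only has to handle the date/datetime values json.dumps cannot serialize natively, could be:

import datetime

def myconverter(obj):
    # Hypothetical json.dumps fallback: render dates as ISO-8601
    # strings and reject anything else.
    if isinstance(obj, (datetime.date, datetime.datetime)):
        return obj.isoformat()
    raise TypeError(f"{type(obj).__name__} is not JSON serializable")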
import sys
import csv

# csv.field_size_limit() raises OverflowError when the value does
# not fit into the C long the csv module uses internally, so keep
# dividing maxInt by 10 until it is accepted.
maxInt = sys.maxsize
decrement = True
while decrement:
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
        decrement = True

# `filename` is assumed to point at the semicolon-delimited CSV.
with open(filename, "r", encoding='UTF8') as f:
    reader = csv.reader(f, delimiter=";")
    data = list(reader)
    row_count = len(data)
print(row_count / 2)
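If the file is large, building the whole list just to count rows is wasteful; a streaming variant counts rows in constant memory:

with open(filename, "r", encoding='UTF8') as f:
    row_count = sum(1 for _ in csv.reader(f, delimiter=";"))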