import pandas as pd
import argparse
from pyarrow.parquet import read_schema
import json
# CLI definition: the user supplies exactly one of the three options below.
parser = argparse.ArgumentParser(
    description='Convert between CSV and Parquet files, or inspect a Parquet schema.'
)
parser.add_argument('--csv2parquet',
                    nargs=2,
                    metavar=('SRC_CSV', 'DST_PARQUET'),
                    help='csv to parquet'
                    )
parser.add_argument('--parquet2csv',
                    nargs=2,
                    metavar=('SRC_PARQUET', 'DST_CSV'),
                    help='parquet to csv'
                    )
parser.add_argument('--get_schema',
                    metavar='PARQUET_FILE',
                    help='to get a schema'
                    )
args = parser.parse_args()
# NOTE: a stray debug `print(args)` used to run here on every invocation,
# polluting stdout; removed.
# csv_to_parquet
def csv_to_parquet(src_filename, dst_filename):
    """Load the CSV at *src_filename* and write it to *dst_filename* as Parquet."""
    frame = pd.read_csv(src_filename)
    frame.to_parquet(dst_filename)
# parquet_to_csv
def parquet_to_csv(src_filename, dst_filename):
    """Load the Parquet file at *src_filename* and write it to *dst_filename* as CSV."""
    df = pd.read_parquet(src_filename)
    # index=False: without it pandas prepends an unnamed index column to the
    # CSV, so a file round-tripped back through --csv2parquet gains a bogus
    # "Unnamed: 0" column on every pass.
    df.to_csv(dst_filename, index=False)
# get schema
def get_schema(filename):
    """Print the column schema of the Parquet file at *filename*.

    Prefers the JSON row schema that Spark embeds in the file metadata
    (key ``org.apache.spark.sql.parquet.row.metadata``). For files not
    written by Spark — where that key is absent, or ``schema.metadata``
    is None — the original code crashed; fall back to printing the plain
    pyarrow schema instead.
    """
    schema = read_schema(filename)
    metadata = schema.metadata or {}
    spark_meta = metadata.get(b'org.apache.spark.sql.parquet.row.metadata')
    if spark_meta is not None:
        print(json.loads(spark_meta)['fields'])
    else:
        # Not a Spark-written file: show the pyarrow schema rather than raising.
        print(schema)
# Dispatch on whichever option the user supplied (first match wins).
if args.csv2parquet is not None:
    csv_to_parquet(args.csv2parquet[0], args.csv2parquet[1])
elif args.parquet2csv is not None:
    parquet_to_csv(args.parquet2csv[0], args.parquet2csv[1])
elif args.get_schema is not None:
    get_schema(args.get_schema)
else:
    # No action requested: show usage instead of exiting silently.
    parser.print_help()