I was able to get it working using these references:
http://arrow.apache.org/docs/python/generated/pyarrow.Table.html
http://arrow.apache.org/docs/python/generated/pyarrow.Field.html
Basically it loops through the original table and creates new columns (pa.array) with the adjusted text that it appends to a new table. It’s probably not the best way to do it, but it worked. Most importantly, it let me preserve the nulls and specify the data type of each column.
import sys, getopt
import random
import re
import math
import pyarrow.parquet as pq
import pyarrow.csv as pcsv
import numpy as np
import pandas as pd
import pyarrow as pa
import os.path
<a lot of other code here>
# Rebuild table2 column by column into table3, passing the values of every
# column listed in change_columns through change_str(), then write the result
# to out_file as a Parquet file.
# NOTE(review): table2, change_columns, change_str and out_file are not
# defined in this section; they are assumed to be set up by the code above.
changed_ct = 0
all_cols_ct = 0
changed_column_list = []
new_arrays = []
new_fields = []
for col_name in table2.column_names:
    print('processing column: ' + col_name)
    # Reuse the column's own Arrow type so the rebuilt column keeps it,
    # whatever that type is (not only string/int32/int64).
    col_data_type = table2.schema.field(col_name).type
    # to_pylist() converts every value to its Python equivalent; nulls
    # come back as None and stay null when the array is rebuilt below.
    values = table2.column(col_name).to_pylist()
    if col_name in change_columns:
        # A column without any rows has nothing to change, so it is not
        # reported or counted.
        if values:
            print('changing values in column ' + col_name)
            changed_column_list.append(col_name)
            changed_ct += 1
        # Null entries (None) are handed to change_str as well.
        values = [change_str(value) for value in values]
    new_arrays.append(pa.array(values, type=col_data_type))
    new_fields.append(pa.field(col_name, col_data_type))
    all_cols_ct += 1

# Build the table in one step from the collected columns. This avoids a
# placeholder column with a hard-coded row count, which only works when
# table2 happens to have exactly that many rows.
table3 = pa.Table.from_arrays(new_arrays, schema=pa.schema(new_fields))

print('changed ' + str(changed_ct) + ' columns:')
print(*changed_column_list, sep='\n')

# WRITE NEW PARQUET FILE
pq.write_table(table3, out_file)