Convert dictionary to df |
pd.DataFrame(data) |
pl.DataFrame(data) |
Make 2D Numpy array into df |
pd.DataFrame(my_ar, columns=['col1', 'col2', 'col3']) |
pl.DataFrame(my_ar, schema=['col1', 'col2', 'col3'], orient='row') |
Read from CSV |
pd.read_csv(file_name) |
pl.read_csv(file_name) |
Lazily read CSV |
— |
pl.scan_csv(file_name) |
Read from Excel |
pd.read_excel(file_name) |
pl.read_excel(file_name) |
Read from JSON |
pd.read_json(file_name) |
pl.read_json(file_name) |
Read from HDF5 |
pd.read_hdf(file_name) |
— |
Write CSV |
df.to_csv(fname, index=False) |
df.write_csv(fname) |
Rename columns |
df.rename(columns={'c1': 'cat1', 'c2': 'cat2'}) |
df.rename({'c1': 'cat1', 'c2': 'cat2'}) |
Get column 'q1' as series |
df['q1'] or df.get_column('q1') or df.loc[:,'q1'] |
df['q1'] or df.get_column('q1') |
Get column 'q1' as data frame |
df[['q1']] or df.loc[:,['q1']] |
df.select('q1') |
Get columns 'c1' and 'q2' |
df[['c1', 'q2']] or df.loc[:, ['c1', 'q2']] |
df.select('c1', 'q2') |
Get columns containing floats |
df.select_dtypes(float) |
df.select(cs.float()) |
Get row 4 |
df.loc[4, :] |
df.row(4) or df.row(4, named=True) |
Get row 4 as data frame |
df.loc[[4], :] |
df[4] |
Get row where i1 is 2 |
df.loc[df['i1']==2, :] |
df.row(by_predicate=pl.col('i1')==2) or df.filter(pl.col('i1')==2) |
Sub df with rows where c1 is 'a' |
df.loc[df['c1']=='a', :] |
df.filter(pl.col('c1')=='a') |
Sub df where c1 is 'a' and c2 is 'd' |
df.loc[(df['c1']=='a') & (df['c2']=='d'), :] |
df.filter((pl.col('c1')=='a') & (pl.col('c2')=='d')) |
Iterate over columns of df |
for col, s in df.items() |
for s in df |
Iterate over rows of df |
for row_ind, row in df.iterrows() |
for r in df.iter_rows(named=True) |
Group by single column |
df.groupby('c1') |
df.group_by('c1') |
Group by maintaining order |
df.groupby('c1', sort=False) |
df.group_by('c1', maintain_order=True) |
Group by multiple columns |
df.groupby(['c1', 'c2']) |
df.group_by(['c1', 'c2']) |
Iterate over groups |
for group, subdf in df.groupby('c1') |
for (group,), subdf in df.group_by('c1') |
Iterate over nested groups |
for (g1, g2), subdf in df.groupby(['c1', 'c2']) |
for (g1, g2), subdf in df.group_by(['c1', 'c2']) |
Group by and apply mean¹ |
df.groupby('c1').mean(numeric_only=True) |
df.group_by('c1').mean() |
Group by and apply median to one column |
df.groupby('c1')['q1'].median() |
df.group_by('c1').agg(pl.col('q1').median()) |
Group by and apply mean to two columns |
df.groupby('c1')[['q1', 'q2']].mean() |
df.group_by('c1').agg(pl.col('q1', 'q2').mean()) |
Group by and apply custom func to col² |
df.groupby('c1')['q1'].apply(my_fun) |
df.group_by('c1').agg(pl.col('q1').map_batches(my_fun, return_dtype=float, returns_scalar=True)) |
Group by and apply custom func to 2 cols³ |
df.groupby('c1')[['q1', 'q2']].apply(my_fun) |
df.group_by('c1').agg(pl.struct(['q1', 'q2']).map_batches(my_fun, return_dtype=float, returns_scalar=True)) |
Group by and rank within each group |
df.groupby('c1')['q1'].rank() |
df.select(pl.col('q1').rank().over('c1')) |
Convert to tall format |
df.melt(value_name='value', var_name='var', id_vars='id') |
df.unpivot(value_name='value', variable_name='var', index='id') |
Pivot tall result above |
df_tall.pivot(columns='var', index='id').reset_index() |
df_tall.pivot(on='var', index='id') |
Select columns with string in name |
df.filter(regex='q') or df[df.columns[df.columns.str.contains('q')]] |
df.select(cs.contains('q')) |
Add column of zeros to data frame |
df['new_col'] = 0 or df.assign(new_col=0) |
df.with_columns(pl.lit(0).alias('new_col')) |
Add a Numpy array as column |
df['new_col'] = my_array or df.assign(new_col=my_array) |
df.with_columns(pl.Series(my_array).alias('new_col')) |
Multiply two columns; make new column |
df['q1q2'] = df['q1'] * df['q2'] or df.assign(q1q2=df['q1'] * df['q2'] |
df.with_columns((pl.col('q1') * pl.col('q2')).alias('q1q2')) |
Apply a function to each row making new col⁴ |
df.assign(new_col=my_fun) |
df.with_columns(pl.struct(pl.all()).map_elements(my_fun, return_dtype=float).alias('new_col')) |
Drop rows with missing data |
df.dropna() |
df.drop_nulls() |
Sort according to a column |
df.sort_values(by='i1') |
df.sort(by='i1') |
Inner join two data frames⁵ |
pd.merge(df, df2, on=shared_columns) |
df.join(df2, on=shared_columns) |
Concatenate data frames vertically |
pd.concat((df, df2)) |
pl.concat((df, df2), how='diagonal') |
Concatenate data frames horizontally |
pd.concat((df, df2), axis=1) |
pl.concat((df, df2), how='horizontal') |