pandas vs FiftyOne#

This cheat sheet shows how to translate common pandas operations into FiftyOne.

Nomenclature#

pandas

FiftyOne

DataFrame (df)

Dataset (ds)

Row

Sample

Column

Field

Getting started#

pandas

FiftyOne

Importing the packages

import pandas as pd

import fiftyone as fo

Create empty dataset

df = pd.DataFrame()

ds = fo.Dataset()

Load dataset

df = pd.read_csv(*)

ds = fo.Dataset.from_dir(*)

Basics#

pandas

FiftyOne

First row/sample

df.iloc[0] or df.head(1)

ds.first() or ds.head(1)

Last row/sample

df.iloc[-1] or df.tail(1)

ds.last() or ds.tail(1)

First few rows/samples

df.head()

ds.head()

Last few rows/samples

df.tail()

ds.tail()

Get specific row/sample

df.loc[j]

ds[sample_id]

Number of rows/samples

len(df)

len(ds)

Column names/field schema

df.columns

ds.get_field_schema()

Get all values in column/field

df[*].tolist()

ds.values(*)

View stages#

pandas

FiftyOne

Make a copy

df.copy()

ds.clone()

Slice

df[start:end]

ds[start:end]

Random sample

df.sample(n=n)

ds.take(n)

Shuffle data

df.sample(frac=1)

ds.shuffle()

Filter by column/field value

df[df[*] > threshold]

ds.match(F(*) > threshold)

Sort values

df.sort_values(*)

ds.sort_by(*)

Delete all

import gc
del df; gc.collect()

ds.delete()

Aggregations#

pandas

FiftyOne

Count

df[*].count()

ds.count(*)

Sum

df[*].sum()

ds.sum(*)

Unique values

df[*].unique()

ds.distinct(*)

Bounds

min = df[*].min()
max = df[*].max()

min, max = ds.bounds(*)

Mean

df[*].mean()

ds.mean(*)

Standard deviation

df[*].std()

ds.std(*)

Quantile

df[*].quantile(values)

ds.quantiles(*, values)

Structural changes#

pandas

FiftyOne

New column/field as constant value

df["col"] = value

ds.add_sample_field("field", fo.StringField)
ds.set_field("field", value).save()

New column/field from external data

df["col"] = data

ds.set_values("field", data)

New column/field from existing columns/fields

df["col"] = df.apply(fcn, axis=1)

ds.add_sample_field("field", fo.FloatField)
ds.set_field("field", expression).save()

Remove a column/field

df = df.drop(["col"], axis=1)

ds.delete_sample_fields(["field"]) or
ds.exclude_fields(["field"]).keep_fields()

Keep only specified columns/fields

df[["col1", "col2"]]

ds.select_fields(["field1", "field2"])

Concatenate DataFrames or DatasetViews

pd.concat([df1, df2])

view1.concat(view2)

Add a single row/sample

pd.concat([df, pd.DataFrame([row])], ignore_index=True)

ds.add_sample(sample)

Remove rows/samples

df.drop(rows)

ds.delete_samples(sample_ids) or
ds.exclude(samples).keep()

Keep only specified rows/samples

df.iloc[rows]

ds.select(sample_ids)

Rename column/field

df.rename(columns={"old": "new"})

ds.rename_sample_field("old", "new")

Expressions#

pandas

FiftyOne

Exact equality

df[df[*] == value]

ds.match(F(*) == value)

Less than or equal to

new_df = df[df[*] <= value]

new_view = ds.match(F(*) <= value)

Logical complement

new_df = df[~(df[*] <= value)]

new_view = ds.match(~(F(*) <= value))

Logical AND

df[pd_cond1 & pd_cond2]

ds.match(fo_cond1 & fo_cond2)

Logical OR

df[pd_cond1 | pd_cond2]

ds.match(fo_cond1 | fo_cond2)

Is in

df[*].isin(cols)

ds.filter_labels(*, F("label").is_in(fields))

Contains string

df[*].str.contains(substr)

ds.filter_labels(*, F("label").contains_str(substr))

Check for numerics

pdt.is_numeric_dtype(df[*])

isinstance(ds.get_field_schema()[*], (fo.FloatField, fo.IntField)) or
len(ds.match(F(*).is_number())) > 0

Check for strings

pdt.is_string_dtype(df[*])

isinstance(ds.get_field_schema()[*], fo.StringField) or
len(ds.match(F(*).is_string())) > 0

Check for null entries

df[*].isna().any()

len(ds.match(F(*) == None)) > 0

Note

The tables above assume you have imported:

import pandas.api.types as pdt
from fiftyone import ViewField as F