This tutorial demonstrates how to combine Hugging Face with Flower to federate the training of language models across multiple clients without sharing raw data. Specifically, we fine-tune a pre-trained DistilBERT model for sequence classification on the IMDB dataset to detect positive or negative movie reviews.
A complete notebook is available here, which uses Flower's simulation functionality to emulate a federated setting inside Google Colab.
Dependencies
Install the required packages:
pip install datasets evaluate flwr torch transformers
Standard Hugging Face Workflow
Handling the Data
Use Hugging Face's datasets library to load the IMDB dataset, tokenize it, and create PyTorch DataLoaders:
import random
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
CHECKPOINT = "distilbert-base-uncased"
def load_data():
    raw_datasets = load_dataset("imdb")
    raw_datasets = raw_datasets.shuffle(seed=42)
    # The unlabeled split is not needed for supervised fine-tuning
    del raw_datasets["unsupervised"]

    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # Sample 100 examples per split to keep local training fast
    train_population = random.sample(range(len(raw_datasets["train"])), 100)
    test_population = random.sample(range(len(raw_datasets["test"])), 100)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    tokenized_datasets["train"] = tokenized_datasets["train"].select(train_population)
    tokenized_datasets["test"] = tokenized_datasets["test"].select(test_population)

    tokenized_datasets = tokenized_datasets.remove_columns("text")
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    # Dynamically pad each batch to the length of its longest sequence
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, batch_size=32, collate_fn=data_collator
    )
    testloader = DataLoader(
        tokenized_datasets["test"], batch_size=32, collate_fn=data_collator
    )

    return trainloader, testloader
trainloader, testloader = load_data()
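To verify the pipeline, it can help to inspect one batch from the loader; with DistilBERT the collator produces input_ids, attention_mask, and labels, padded per batch. This quick check is illustrative and not part of the original workflow:

# Illustrative sanity check: inspect one dynamically padded batch
batch = next(iter(trainloader))
print({k: tuple(v.shape) for k, v in batch.items()})
# Sequence lengths vary from batch to batch because padding is dynamic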
Training and Testing the Model
Define the training and testing loops using standard PyTorch:
from evaluate import load as load_metric
from torch.optim import AdamW  # transformers' own AdamW is deprecated/removed in recent versions
def train(net, trainloader, epochs):
    optimizer = AdamW(net.parameters(), lr=5e-5)
    net.train()
    for _ in range(epochs):
        for batch in trainloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = net(**batch)
            # The model computes the loss internally when labels are provided
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
def test(net, testloader):
    metric = load_metric("accuracy")
    loss = 0
    net.eval()
    for batch in testloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = net(**batch)
        logits = outputs.logits
        loss += outputs.loss.item()
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    loss /= len(testloader.dataset)
    accuracy = metric.compute()["accuracy"]
    return loss, accuracy
Creating the Model
Load the pre-trained DistilBERT model for sequence classification:
from transformers import AutoModelForSequenceClassification
net = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=2).to(DEVICE)
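Before federating, it is worth sanity-checking the centralized setup with the helpers defined above. A brief illustrative run, one epoch on the 100-example subset:

# Centralized baseline using the train/test helpers defined above
train(net, trainloader, epochs=1)
loss, accuracy = test(net, testloader)
print(f"Centralized baseline -> loss: {loss:.4f}, accuracy: {accuracy:.4f}")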
Federating the Example
Federated learning trains a model across multiple clients without sharing raw data. Each client trains locally on its own data and sends only the updated model parameters to a server, which aggregates them using a strategy such as FedAvg (federated averaging). Flower handles the communication and orchestration for us.
Creating the IMDBClient
Define a custom Flower client class inheriting from flwr.client.NumPyClient:
from collections import OrderedDict
import flwr as fl
class IMDBClient(fl.client.NumPyClient):
    def get_parameters(self, config):
        # Serialize the model weights as a list of NumPy arrays
        return [val.cpu().numpy() for _, val in net.state_dict().items()]

    def set_parameters(self, parameters):
        # Load the aggregated weights received from the server
        params_dict = zip(net.state_dict().keys(), parameters)
        state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict})
        net.load_state_dict(state_dict, strict=True)

    def fit(self, parameters, config):
        self.set_parameters(parameters)
        print("Training Started...")
        train(net, trainloader, epochs=1)
        print("Training Finished...")
        return self.get_parameters(config={}), len(trainloader.dataset), {}

    def evaluate(self, parameters, config):
        self.set_parameters(parameters)
        loss, accuracy = test(net, testloader)
        return float(loss), len(testloader.dataset), {"accuracy": float(accuracy)}
Starting the Server
Launch the Flower server and start a client (or multiple clients for true federated learning). In a real deployment, you would run the client code on separate machines.
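One way to wire this up is sketched below using Flower's start_server and start_numpy_client entry points; the addresses, round count, and the weighted_average helper are illustrative choices, not fixed by this tutorial.

# server.py - aggregate client updates with FedAvg (values are illustrative)
import flwr as fl

def weighted_average(metrics):
    # Weight each client's reported accuracy by its number of examples
    accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics]
    examples = [num_examples for num_examples, _ in metrics]
    return {"accuracy": sum(accuracies) / sum(examples)}

strategy = fl.server.strategy.FedAvg(
    evaluate_metrics_aggregation_fn=weighted_average,
)

fl.server.start_server(
    server_address="0.0.0.0:8080",
    config=fl.server.ServerConfig(num_rounds=3),
    strategy=strategy,
)

# client.py - run once per client machine (address is illustrative)
fl.client.start_numpy_client(
    server_address="127.0.0.1:8080",
    client=IMDBClient(),
)

FedAvg averages client updates weighted by the number of training examples each client returns from fit; the evaluate_metrics_aggregation_fn above aggregates the clients' reported accuracies the same way. The Colab notebook mentioned earlier instead uses Flower's simulation functionality to run the server and all clients in a single process.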