基于BERT实现文本分类 二分类 情感分类

avatar 2024年04月26日19:25:07 0 133 views
博主分享免费Java教学视频,B站账号:Java刘哥

直接贴代码

一、模型定义和训练

# 1. 导入包
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


# 2. Load the IMDB reviews and their sentiment labels
def load_imdb_data(data_file, start=0, stop=1000):
    """Read a window of rows from the IMDB review CSV.

    Args:
        data_file: Path to a CSV file with ``review`` and ``sentiment`` columns.
        start: Zero-based position of the first data row to keep (default 0).
            Must be non-negative.
        stop: Position one past the last data row to keep. The default of
            1000 keeps training time short; ``None`` reads to the end of the
            file. Must be non-negative when given.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is the list of values from the
        ``review`` column; ``labels`` holds 1 where ``sentiment`` equals
        ``"positive"`` and 0 for every other value.
    """
    # nrows stops parsing after `stop` rows instead of loading the whole
    # file only to discard most of it.
    df = pd.read_csv(data_file, nrows=stop).iloc[start:stop]
    texts = df['review'].tolist()
    labels = [1 if sentiment == "positive" else 0 for sentiment in df['sentiment']]
    return texts, labels


# Location of the IMDB review CSV on the author's machine; point this at
# your own copy of the dataset before running.
data_file = "K:/workspace-sync/datasets/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
# Load the review texts and their 0/1 sentiment labels.
texts, labels = load_imdb_data(data_file)
print(f"Number of samples: {len(texts)}")


# 3. Custom dataset that tokenizes each text on access
class TextClassificationDataset(Dataset):
    """Dataset pairing raw texts with their labels.

    Tokenization happens lazily in ``__getitem__``: every text is padded or
    truncated to ``max_length`` by the supplied tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the texts, labels, tokenizer and the fixed sequence length."""
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            A dict with the flattened ``input_ids`` and ``attention_mask``
            tensors produced by the tokenizer, plus ``label`` as a tensor.
        """
        text, label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # flatten() collapses each tokenizer output to one dimension.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(label)
        return sample


# 4. BERT encoder with a linear classification head
class BERTClassifier(nn.Module):
    """Classifier that feeds BERT's ``pooler_output`` through dropout and a linear layer."""

    def __init__(self, bert_model_name, num_classes):
        """Load the pretrained encoder and build the classification head.

        Args:
            bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))


# 5. Training function
def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating the model after every batch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns logits.
        data_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()  # switch to training mode
    for batch in data_loader:
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )
        optimizer.zero_grad()  # clear gradients left over from the previous batch
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        criterion(logits, labels).backward()  # cross-entropy loss, then backpropagation
        optimizer.step()  # update the parameters
        scheduler.step()  # update the learning rate


# 6. Evaluation function
def evaluate(model, data_loader, device):
    """Score the model on every batch of ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns logits.
        data_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple of the results of ``accuracy_score`` and
        ``classification_report`` over all batches.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
            )
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            # The index of the largest logit in each row is the predicted class.
            predicted += logits.max(dim=1).indices.cpu().tolist()
            expected += labels.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report


# 8. Model and training hyperparameters
# Local directory holding the pretrained bert-base-uncased files.
bert_model_name = 'K:/workspace-sync/models/bert-base-uncased'

batch_size = 2  # number of samples processed per step
max_length = 128  # token length of every sample; shorter texts are padded, longer ones truncated
# Each input batch therefore has shape [batch_size, max_length], i.e. [2, 128].

num_classes = 2  # number of classes; the logits have shape [batch_size, 2]

num_epochs = 4  # number of passes over the training data; trades training time against model quality
learning_rate = 2e-5  # step size of the parameter updates

# 9. Split the data: 80% for training, 20% for validation (fixed seed so the split is repeatable)
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# 10. Initialize the tokenizer, the datasets and the data loaders
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
# Only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# 11. Pick the device (GPU when available, otherwise CPU) and build the model on it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

# 12. Set up the optimizer and the learning-rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# 13. Train the model, reporting validation metrics after every epoch
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

# Save the model.
# torch.save does not create missing directories, so make sure "model/"
# exists first; otherwise the finished training run would be lost to an
# error at the very last step.
import os

os.makedirs("model", exist_ok=True)
# The whole model object is pickled (not just its state_dict), so the loading
# script must define a BERTClassifier class with the same name.
torch.save(model, "model/bert_classifier.pth")

 

二、测试

# 1. 导入包
import pandas as pd
import torch
from torch import nn
from transformers import BertTokenizer, BertModel

# 2. BERT encoder with a linear classification head
class BERTClassifier(nn.Module):
    """Classifier that feeds BERT's ``pooler_output`` through dropout and a linear layer."""

    def __init__(self, bert_model_name, num_classes):
        """Load the pretrained encoder and build the classification head.

        Args:
            bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))


# 3. Prediction function
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text and return the predicted class index.

    Args:
        text: The text to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns logits.
        tokenizer: Callable that turns ``text`` into ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length the text is padded or truncated to.

    Returns:
        The index of the largest logit as a Python number.
    """
    model.eval()
    encoded = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        best = logits.max(dim=1).indices
    return best.item()


# 4. Load the trained model and the tokenizer
# test_text = "The movie was great and I really enjoyed the performances of the actors."
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The checkpoint is a fully pickled model object rather than a state_dict, so:
#   * map_location remaps its tensors onto the device available here, which
#     lets a checkpoint saved on a GPU load on a CPU-only machine;
#   * weights_only=False is needed on PyTorch versions where torch.load
#     defaults to weights-only loading and would refuse this checkpoint.
# NOTE: unpickling can execute arbitrary code; only load checkpoints you trust.
model = torch.load("model/bert_classifier.pth", map_location=device, weights_only=False)
model.to(device)  # inputs are moved to `device`, so the model must live there too
bert_model_name = 'K:/workspace-sync/models/bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)


# 5. Load the IMDB reviews used for testing
def load_imdb_data(data_file, start=1000, stop=2000):
    """Read a window of rows from the IMDB review CSV.

    Args:
        data_file: Path to a CSV file with ``review`` and ``sentiment`` columns.
        start: Zero-based position of the first data row to keep. Defaults
            to 1000. Must be non-negative.
        stop: Position one past the last data row to keep (default 2000).
            ``None`` reads to the end of the file. Must be non-negative when
            given.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is the list of values from the
        ``review`` column; ``labels`` holds 1 where ``sentiment`` equals
        ``"positive"`` and 0 for every other value.
    """
    # nrows stops parsing after `stop` rows instead of loading the whole
    # file only to discard most of it.
    df = pd.read_csv(data_file, nrows=stop).iloc[start:stop]
    texts = df['review'].tolist()
    labels = [1 if sentiment == "positive" else 0 for sentiment in df['sentiment']]
    return texts, labels


# Location of the IMDB review CSV; the rows returned by load_imdb_data are the test set.
data_file = "K:/workspace-sync/datasets/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
test_texts, test_labels = load_imdb_data(data_file)

# Predict every test review, print label and prediction, and count the matches.
count_correct = 0
for text, label in zip(test_texts, test_labels):
    sentiment = predict_sentiment(text, model, tokenizer, device)
    print('label:', label)
    print('predit:', sentiment)
    count_correct += int(label == sentiment)
print('accuracy:', count_correct / len(test_texts))

 

 

补充

1、下载 bert-base-uncased

在 Google 上搜索即可下载到。

还是贴下链接吧:

https://huggingface.co/google-bert/bert-base-uncased

2、下载数据集

https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

 

 

参考文章:https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b

  • 微信
  • 交流学习,有偿服务
  • weinxin
  • 博客/Java交流群
  • 资源分享,问题解决,技术交流。群号:590480292
  • weinxin
avatar

发表评论

avatar 登录者:匿名
匿名评论,评论回复后会有邮件通知

  

已通过评论:0   待审核评论数:0