# Jiayi_GPT2_v2.py
###############################################################################
# 重要: 请务必把任务(jobs)中需要保存的文件存放在 results 文件夹内
# Important : Please make sure your files are saved to the 'results' folder
# in your jobs
###############################################################################
# sh Miniconda3-latest-Linux-x86_64.sh
# conda create -n poem python==3.10.12
# pip install peft==0.4.0
# pip install transformers
# virtualenv -p miniconda3/envs/poem/bin/python3 poem_env
# source poem_env/bin/activate
# vim ~/.bashrc
# export PATH="/home/jovyan/miniconda3/bin:$PATH"
# source ~/.bashrc
# python -m ipykernel install --user --name=poem
import warnings
warnings.filterwarnings("ignore")
import transformers
import tensorboardX
import peft
import time
import math

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
import transformers
from tensorboardX import SummaryWriter  # provides TensorBoard-compatible logging
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
from peft import TaskType, LoraConfig, get_peft_model

# Shared tokenizer for the 110M-parameter Chinese GPT-2; used by collate_fn and generate_prompt.
tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Wenzhong-GPT2-110M")
# GPT-2 has no pad token by default; reuse EOS so batch padding works.
tokenizer.pad_token = tokenizer.eos_token
class PoemDataset(Dataset):
    """Line-based text dataset: each line of the file is one poem sample.

    Stripping of surrounding whitespace/newlines happens lazily in __getitem__.
    """

    def __init__(self, path):
        super().__init__()
        # Fix: the original `open(path, ...).readlines()` leaked the file handle;
        # a context manager guarantees it is closed after reading.
        with open(path, encoding='utf-8') as f:
            self.poems = f.readlines()  # optionally cap corpus size, e.g. [:30000]

    def __getitem__(self, idx):
        # Return the raw poem text with the trailing newline removed.
        return self.poems[idx].strip()

    def __len__(self):
        return len(self.poems)


def collate_fn(samples):
    """Batch-tokenize raw poem strings, padding to the longest sample in the batch."""
    batch = tokenizer(
        samples,
        padding="longest",
        truncation=True,
        return_tensors='pt',
    )
    return batch


def generate_prompt(bs, prompt="写一首唐诗:"):
    """Tokenize *bs* copies of the fixed task prompt for prepending to a batch."""
    repeated = [prompt for _ in range(bs)]
    return tokenizer(repeated, padding="longest", truncation=True, return_tensors='pt')


@torch.no_grad()
def inference(model):
    """Sample five continuations of a fixed Tang-poem prompt and print them."""
    encoded = tokenizer("写一首唐诗:折戟沉沙铁未销,自将磨洗认前朝。", return_tensors='pt')
    encoded = encoded.to(device)
    # Nucleus sampling (top_p) with 5 independent sequences per prompt.
    generated = model.generate(
        **encoded,
        return_dict_in_generate=True,
        max_length=150,
        do_sample=True,
        top_p=0.6,
        num_return_sequences=5,
    )
    decoded = tokenizer.batch_decode(generated.sequences, skip_special_tokens=True)
    print(decoded)


@torch.no_grad()
def evaluate(model, test_loader, epoch, **kwargs):
    """Run one evaluation pass over *test_loader*, printing mean loss and "PPL".

    Each batch is prefixed with the fixed generation prompt; prompt positions
    are excluded from the LM loss by setting their labels to -100.

    Args:
        model: causal LM returning `.loss` and `.logits` when given `labels`.
        test_loader: DataLoader yielding tokenized batches (BatchEncoding).
        epoch: current epoch index, used only for logging.
        **kwargs: expects at least 'max_epochs' for the log line.
    """
    loss_list = []
    ppl_list = []
    print("-" * 20 + "   Evaluating   " + "-" * 20)
    model.eval()
    # Fix: removed the redundant inner `with torch.no_grad():` — the
    # @torch.no_grad() decorator already disables gradient tracking here.
    for local_step, inputs in enumerate(tqdm(test_loader)):
        # Prepend the task prompt to every sample in the batch.
        prompt = generate_prompt(inputs.input_ids.shape[0]).to(device)
        inputs = inputs.to(device)
        input_ids = torch.cat([prompt.input_ids, inputs.input_ids], dim=1)
        # Label id -100 masks prompt tokens out of the loss.
        no_loss_ids = torch.ones_like(prompt.input_ids) * -100
        label_ids = torch.cat([no_loss_ids, inputs.input_ids], dim=1)
        attention_mask = torch.cat([prompt.attention_mask, inputs.attention_mask], dim=1)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=label_ids,
            return_dict=True,
        )
        loss_list.append(outputs.loss.cpu().item())
        # "PPL" here exponentiates the mean negative log of the *max* softmax
        # probability per position, not the reference-token probability.
        # NOTE(review): true perplexity would use the label tokens (or
        # exp(loss)); kept as-is to preserve the original metric.
        probs = torch.softmax(outputs.logits, dim=-1).max(dim=-1)[0]  # BLC->BL
        ppl = torch.exp(-probs.log().mean(-1))
        ppl_list.append(ppl.mean().cpu().item())
    # Report epoch-level averages.
    avg_loss = sum(loss_list) / len(loss_list)
    avg_ppl = sum(ppl_list) / len(ppl_list)
    print(f"Epoch {epoch}/{kwargs['max_epochs']} | loss:{avg_loss:.5f} | ppl: {avg_ppl: 5f}")
    # writer.add_scalar('Test/Loss', sum(loss_list) / len(loss_list), epoch * kwargs['steps_per_epoch'])


def train(model: nn.Module, dataloaders, optimizer, scheduler, **kwargs):
    """Main training entry point.

    Runs kwargs['max_epochs'] epochs over the train loader; after each epoch
    the LoRA adapter is saved, the model is evaluated, and a sample generation
    is printed.

    Args:
        model: PEFT-wrapped causal LM to fine-tune.
        dataloaders: (train_loader, test_loader) pair.
        optimizer: optimizer over the trainable parameters.
        scheduler: LR scheduler stepped once per batch.
        **kwargs: expects 'device', 'logger_name', 'max_epochs',
            'steps_per_epoch'.
    """
    train_loader, test_loader = dataloaders
    device = kwargs['device']
    writer = SummaryWriter(kwargs['logger_name'])
    model = model.to(device)

    for epoch in range(kwargs['max_epochs']):
        # ==========  Train  ==========
        loss_list = []
        model.train()
        last_time = time.time()
        for local_step, inputs in enumerate(train_loader):
            # Global step for TensorBoard scalars.
            step = epoch * kwargs['steps_per_epoch'] + local_step
            # Prepend the fixed task prompt to each sample in the batch.
            prompt = generate_prompt(inputs.input_ids.shape[0]).to(device)
            inputs = inputs.to(device)
            input_ids = torch.cat([prompt.input_ids, inputs.input_ids], dim=1)
            no_loss_ids = torch.ones_like(prompt.input_ids) * -100  # Don't cal loss of prompts
            label_ids = torch.cat([no_loss_ids, inputs.input_ids], dim=1)
            attention_mask = torch.cat([prompt.attention_mask, inputs.attention_mask], dim=1)

            optimizer.zero_grad()

            # Mixed-precision forward pass.
            # NOTE(review): fp16 autocast without a GradScaler can underflow
            # small gradients — confirm training stability or add
            # torch.cuda.amp.GradScaler around backward/step.
            with torch.autocast(device_type='cuda'):  # fp16
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=label_ids,
                    return_dict=True,
                )
                loss = outputs.loss

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Log a running-average loss every 50 steps and on the last step.
            loss_list.append(loss.detach().cpu().item())
            if (local_step % 50 == 0 and local_step != 0) or local_step == kwargs['steps_per_epoch'] - 1:
                avg_loss = sum(loss_list) / len(loss_list)
                n_step_time = time.time() - last_time
                # Rough ETA: remaining 50-step windows x last window's duration.
                left_time = (kwargs['steps_per_epoch'] - local_step) // 50 * n_step_time
                print("Epoch {}/{} | Step {}/{} | loss:{:.5f} time:{:.1f}s left:{:.1f}m".format(
                    epoch, kwargs['max_epochs'], local_step, kwargs['steps_per_epoch'],
                    avg_loss, n_step_time, left_time / 60
                ))
                last_time = time.time()
            writer.add_scalar('Train/Loss', loss, step)
            writer.add_scalar('Epoch', epoch, step)

            # if local_step % 6000 == 0 and local_step != 0:
            #     evaluate(model, test_loader, epoch * kwargs['steps_per_epoch'] + local_step, **kwargs)
        # torch.save(model.named_parameters(), "checkpoint.pth")
        # Saves only the LoRA adapter weights; overwritten every epoch.
        model.save_pretrained("./checkpoint_lora_v2")
        # ==========  Eval  ==========
        evaluate(model, test_loader, epoch, **kwargs)
        print("=" * 53)
        # ==========  Inference  ==========
        inference(model)

# tokenizer("写一首唐诗:", padding="longest", truncation=True, return_tensors='pt')
# prepare data: one poem per line in each file.
train_dataset = PoemDataset("./data/train_poems_v2.txt")
test_dataset = PoemDataset("./data/test_poems_v2.txt")

# Hyperparameter
batch_size = 16
lr = 1e-4
# device = torch.device('cuda')
device=torch.device("cuda" if torch.cuda.is_available() else"cpu")
max_epochs = 2
num_warmup_steps = 200
# Total optimizer steps = epochs x batches per epoch (drives linear LR decay).
num_training_steps = max_epochs * math.ceil(len(train_dataset) / batch_size)
seed = 2023  # NOTE(review): defined but never applied — consider torch.manual_seed(seed).
logger_name = "./logs/1941"  # TensorBoard log directory

# prepare model: base GPT-2 wrapped with LoRA adapters; only adapters (+ all biases) train.
gpt2_model = AutoModelForCausalLM.from_pretrained("IDEA-CCNL/Wenzhong-GPT2-110M")
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16, lora_alpha=16, lora_dropout=0.1, bias='all'
)
gpt2_model = get_peft_model(gpt2_model, peft_config)
gpt2_model.print_trainable_parameters()

# prepare optimizer: AdamW over trainable params only, linear warmup + decay schedule.
optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, gpt2_model.parameters()), lr=lr)
sche = transformers.get_linear_schedule_with_warmup(optim, num_warmup_steps, num_training_steps)

# train
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

train(
    gpt2_model,
    (train_dataloader, val_dataloader),
    optimizer=optim,
    scheduler=sche,
    device=device,
    max_epochs=max_epochs,
    logger_name=logger_name,
    steps_per_epoch=len(train_dataloader)
)