###############################################################################
# Important: please make sure that any files your jobs need to keep are
# saved to the 'results' folder
###############################################################################
# --- One-time environment setup (run in a shell before this script) ---
# sh Miniconda3-latest-Linux-x86_64.sh
# conda create -n poem python==3.10.12
# pip install peft==0.4.0
# pip install transformers
# virtualenv -p miniconda3/envs/poem/bin/python3 poem_env
# source poem_env/bin/activate
# vim ~/.bashrc
# export PATH="/home/jovyan/miniconda3/bin:$PATH"
# source ~/.bashrc
# python -m ipykernel install --user --name=poem
import warnings
warnings.filterwarnings("ignore")

import math
import time

import numpy as np
import torch
import torch.nn as nn
import transformers
from peft import TaskType, LoraConfig, get_peft_model
from tensorboardX import SummaryWriter  # TensorBoard-compatible logging
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Wenzhong-GPT2-110M")
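# GPT-2 ships without a pad token; reusing EOS lets "longest" padding work in batches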
tokenizer.pad_token = tokenizer.eos_token
class PoemDataset(Dataset):
def __init__(self, path):
super().__init__()
        with open(path, encoding='utf-8') as f:
            self.poems = f.readlines()  # one poem per line; slice (e.g. [:30000]) to subsample
def __getitem__(self, idx):
text = self.poems[idx].strip()
return text
def __len__(self):
return len(self.poems)
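# Each line of the data file is assumed to hold one complete poem, e.g.
#   白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
# (illustrative line; the real contents of ./data/*_poems_v2.txt are not shown here)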
def collate_fn(samples):
return tokenizer(samples, padding="longest", truncation=True, return_tensors='pt')
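# A quick sketch of what the collator produces (shapes assume two short samples):
#   batch = collate_fn(["春眠不觉晓", "处处闻啼鸟"])
#   batch.input_ids.shape       -> (2, longest_sequence_length)
#   batch.attention_mask.shape  -> (2, longest_sequence_length); 0 marks padding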
def generate_prompt(bs, prompt="写一首唐诗:"):
return tokenizer([prompt] * bs, padding="longest", truncation=True, return_tensors='pt')
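# generate_prompt(bs) repeats one fixed instruction bs times, so its token ids
# can be concatenated position-wise with a batch of bs poems during training.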
@torch.no_grad()
def inference(model):
inputs = tokenizer("写一首唐诗:折戟沉沙铁未销,自将磨洗认前朝。", return_tensors='pt')
inputs = inputs.to(device)
outputs = model.generate(
**inputs,
return_dict_in_generate=True,
max_length=150,
do_sample=True,
top_p=0.6,
num_return_sequences=5
)
print(tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True))
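# do_sample=True with top_p=0.6 is nucleus sampling: each step samples only from
# the smallest token set whose cumulative probability reaches 0.6, so the five
# returned sequences differ; greedy decoding would make them identical.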
@torch.no_grad()
def evaluate(model, test_loader, epoch, **kwargs):
loss_list = []
ppl_list = []
print("-" * 20 + " Evaluating " + "-" * 20)
model.eval()
    for local_step, inputs in enumerate(tqdm(test_loader)):
        # prepend the instruction prompt to every poem in the batch
        prompt = generate_prompt(inputs.input_ids.shape[0]).to(device)
        inputs = inputs.to(device)
        input_ids = torch.cat([prompt.input_ids, inputs.input_ids], dim=1)
        no_loss_ids = torch.ones_like(prompt.input_ids) * -100  # -100: no loss on prompt tokens
        label_ids = torch.cat([no_loss_ids, inputs.input_ids], dim=1)
        attention_mask = torch.cat([prompt.attention_mask, inputs.attention_mask], dim=1)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=label_ids,
            return_dict=True,
        )
        loss_list.append(outputs.loss.cpu().item())
        # perplexity is the exponential of the mean cross-entropy loss
        ppl_list.append(torch.exp(outputs.loss).cpu().item())
# log
avg_loss = sum(loss_list) / len(loss_list)
avg_ppl = sum(ppl_list) / len(ppl_list)
print(f"Epoch {epoch}/{kwargs['max_epochs']} | loss:{avg_loss:.5f} | ppl: {avg_ppl: 5f}")
# writer.add_scalar('Test/Loss', sum(loss_list) / len(loss_list), epoch * kwargs['steps_per_epoch'])
def train(model: nn.Module, dataloaders, optimizer, scheduler, **kwargs):
"""训练的主函数"""
train_loader, test_loader = dataloaders
device = kwargs['device']
writer = SummaryWriter(kwargs['logger_name'])
    model = model.to(device)
    # GradScaler guards fp16 gradients against underflow; disabled on CPU
    scaler = torch.cuda.amp.GradScaler(enabled=(device.type == 'cuda'))
for epoch in range(kwargs['max_epochs']):
# ========== Train ==========
loss_list = []
model.train()
last_time = time.time()
for local_step, inputs in enumerate(train_loader):
step = epoch * kwargs['steps_per_epoch'] + local_step
# prepare prompts
prompt = generate_prompt(inputs.input_ids.shape[0]).to(device)
inputs = inputs.to(device)
input_ids = torch.cat([prompt.input_ids, inputs.input_ids], dim=1)
            no_loss_ids = torch.ones_like(prompt.input_ids) * -100  # -100: no loss on prompt tokens
label_ids = torch.cat([no_loss_ids, inputs.input_ids], dim=1)
attention_mask = torch.cat([prompt.attention_mask, inputs.attention_mask], dim=1)
            optimizer.zero_grad()
            # mixed-precision forward pass (fp16 on CUDA, plain fp32 on CPU)
            with torch.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=label_ids,
                    return_dict=True,
                )
                loss = outputs.loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
# log
loss_list.append(loss.detach().cpu().item())
if (local_step % 50 == 0 and local_step != 0) or local_step == kwargs['steps_per_epoch'] - 1:
avg_loss = sum(loss_list) / len(loss_list)
n_step_time = time.time() - last_time
left_time = (kwargs['steps_per_epoch'] - local_step) // 50 * n_step_time
print("Epoch {}/{} | Step {}/{} | loss:{:.5f} time:{:.1f}s left:{:.1f}m".format(
epoch, kwargs['max_epochs'], local_step, kwargs['steps_per_epoch'],
avg_loss, n_step_time, left_time / 60
))
last_time = time.time()
writer.add_scalar('Train/Loss', loss, step)
writer.add_scalar('Epoch', epoch, step)
# if local_step % 6000 == 0 and local_step != 0:
# evaluate(model, test_loader, epoch * kwargs['steps_per_epoch'] + local_step, **kwargs)
            # torch.save(model.state_dict(), "checkpoint.pth")
        model.save_pretrained("./checkpoint_lora_v2")  # writes only the LoRA adapter weights
# ========== Eval ==========
evaluate(model, test_loader, epoch, **kwargs)
print("=" * 53)
# ========== Inference ==========
inference(model)
# tokenizer("写一首唐诗:", padding="longest", truncation=True, return_tensors='pt')
# prepare data
train_dataset = PoemDataset("./data/train_poems_v2.txt")
test_dataset = PoemDataset("./data/test_poems_v2.txt")
# Hyperparameters
batch_size = 16
lr = 1e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_epochs = 2
num_warmup_steps = 200
# scheduler.step() runs once per batch, so total steps = epochs * batches per epoch
num_training_steps = max_epochs * math.ceil(len(train_dataset) / batch_size)
seed = 2023
torch.manual_seed(seed)  # fix the RNG so runs are reproducible
logger_name = "./logs/1941"
# prepare model
gpt2_model = AutoModelForCausalLM.from_pretrained("IDEA-CCNL/Wenzhong-GPT2-110M")
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # training mode: keep LoRA weights trainable
    r=16,                  # LoRA rank
    lora_alpha=16,         # scaling factor (alpha / r = 1 here)
    lora_dropout=0.1,
    bias='all',            # also make every bias term trainable
)
gpt2_model = get_peft_model(gpt2_model, peft_config)
gpt2_model.print_trainable_parameters()
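# Expect only a small fraction of the ~110M parameters to be reported as
# trainable (the exact count depends on the peft version and the modules it
# targets for this architecture).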
# prepare optimizer
optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, gpt2_model.parameters()), lr=lr)
sche = transformers.get_linear_schedule_with_warmup(optim, num_warmup_steps, num_training_steps)
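# Linear warmup/decay: lr climbs from 0 to 1e-4 over the first 200 optimizer
# steps, then decays linearly to 0 by step num_training_steps.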
# train
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
train(
gpt2_model,
(train_dataloader, val_dataloader),
optimizer=optim,
scheduler=sche,
device=device,
max_epochs=max_epochs,
logger_name=logger_name,
steps_per_epoch=len(train_dataloader)
)
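
# --- Optional: reload the saved adapter for inference ---
# A minimal sketch, assuming the run above finished and wrote ./checkpoint_lora_v2.
# PeftModel.from_pretrained attaches the stored LoRA weights to a fresh base model.
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("IDEA-CCNL/Wenzhong-GPT2-110M")
lora_model = PeftModel.from_pretrained(base_model, "./checkpoint_lora_v2").to(device)
lora_model.eval()
inference(lora_model)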