Jiayi_GPT2_v2.ipynb

Create the environment with conda

In [ ]:
# sh Miniconda3-latest-Linux-x86_64.sh
# conda create -n poem python==3.10.12
# pip install peft==0.4.0
# pip install transformers

Create the environment with virtualenv

In [ ]:
# virtualenv -p miniconda3/envs/poem/bin/python3 poem_env
# source poem_env/bin/activate

Restore the conda command; remember to activate the environment again after restarting

In [ ]:
# vim ~/.bashrc
# export PATH="/home/jovyan/miniconda3/bin:$PATH"
# source ~/.bashrc

Create a Jupyter kernel for the virtual environment

In [ ]:
# python -m ipykernel install --user --name=poem
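
To confirm the kernel was registered (an optional check):

In [ ]:
# jupyter kernelspec list  # the new "poem" kernel should be listed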
In [1]:
import warnings
warnings.filterwarnings("ignore")
In [2]:
import transformers
import tensorboardX
import peft
In [3]:
import time
import math

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
import transformers
from tensorboardX import SummaryWriter  # provides TensorBoard logging
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
from peft import TaskType, LoraConfig, get_peft_model

tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Wenzhong-GPT2-110M")
tokenizer.pad_token = tokenizer.eos_token
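
GPT-2 ships without a pad token, so reusing eos as pad is what makes batched padding work below. A quick sanity check (the sample lines are arbitrary):

In [ ]:
# Padding uses the eos id; attention_mask zeroes out the padded positions.
batch = tokenizer(["床前明月光,疑是地上霜。", "春眠不觉晓"],
                  padding="longest", return_tensors='pt')
print(batch.input_ids.shape)   # (2, L) with the shorter line padded to the longest
print(batch.attention_mask)    # rows of 1s with trailing 0s over the padding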
In [4]:
class PoemDataset(Dataset):
    def __init__(self, path):
        super().__init__()
        with open(path, encoding='utf-8') as f:
            self.poems = f.readlines()  # [:30000]

    def __getitem__(self, idx):
        text = self.poems[idx].strip()
        return text

    def __len__(self):
        return len(self.poems)


def collate_fn(samples):
    return tokenizer(samples, padding="longest", truncation=True, return_tensors='pt')


def generate_prompt(bs, prompt="写一首唐诗:"):  # prompt: "Write a Tang poem:"
    return tokenizer([prompt] * bs, padding="longest", truncation=True, return_tensors='pt')


@torch.no_grad()
def inference(model):
    inputs = tokenizer("写一首唐诗:折戟沉沙铁未销,自将磨洗认前朝。", return_tensors='pt')
    inputs = inputs.to(device)
    outputs = model.generate(
        **inputs,
        return_dict_in_generate=True,
        max_length=150,
        do_sample=True,
        top_p=0.6,
        num_return_sequences=5
    )
    print(tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True))


@torch.no_grad()
def evaluate(model, test_loader, epoch, **kwargs):
    loss_list = []
    ppl_list = []
    print("-" * 20 + "   Evaluating   " + "-" * 20)
    model.eval()
    for local_step, inputs in enumerate(tqdm(test_loader)):
        # prepare prompts
        prompt = generate_prompt(inputs.input_ids.shape[0]).to(device)
        inputs = inputs.to(device)
        input_ids = torch.cat([prompt.input_ids, inputs.input_ids], dim=1)
        no_loss_ids = torch.ones_like(prompt.input_ids) * -100  # mask prompt tokens out of the loss
        label_ids = torch.cat([no_loss_ids, inputs.input_ids], dim=1)
        attention_mask = torch.cat([prompt.attention_mask, inputs.attention_mask], dim=1)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=label_ids,
            return_dict=True,
        )
        loss_list.append(outputs.loss.cpu().item())
        # approximate PPL from the top-1 token probabilities
        # (true perplexity would use the label-token probabilities, i.e. torch.exp(loss))
        probs = torch.softmax(outputs.logits, dim=-1).max(dim=-1)[0]  # (B, L, C) -> (B, L)
        ppl = torch.exp(-probs.log().mean(-1))
        ppl_list.append(ppl.mean().cpu().item())
    # log
    avg_loss = sum(loss_list) / len(loss_list)
    avg_ppl = sum(ppl_list) / len(ppl_list)
    print(f"Epoch {epoch}/{kwargs['max_epochs']} | loss:{avg_loss:.5f} | ppl: {avg_ppl:.5f}")
    # writer.add_scalar('Test/Loss', avg_loss, epoch * kwargs['steps_per_epoch'])
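
# --- Why the -100 labels work: Hugging Face causal-LM models compute
# CrossEntropyLoss(ignore_index=-100) internally, so positions labeled -100
# (the prompt) are excluded from the loss. Toy check with dummy tensors
# (HF also shifts labels one step for next-token prediction; omitted here):
_demo_logits = torch.randn(1, 4, 10)               # (batch, seq, vocab)
_demo_labels = torch.tensor([[-100, -100, 3, 7]])  # two "prompt" positions masked
_demo_loss = nn.CrossEntropyLoss(ignore_index=-100)(
    _demo_logits.view(-1, 10), _demo_labels.view(-1))  # averages over the 2 unmasked positions only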


def train(model: nn.Module, dataloaders, optimizer, scheduler, **kwargs):
    """训练的主函数"""
    train_loader, test_loader = dataloaders
    device = kwargs['device']
    writer = SummaryWriter(kwargs['logger_name'])
    model = model.to(device)

    for epoch in range(kwargs['max_epochs']):
        # ==========  Train  ==========
        loss_list = []
        model.train()
        last_time = time.time()
        for local_step, inputs in enumerate(train_loader):
            step = epoch * kwargs['steps_per_epoch'] + local_step
            # prepare prompts
            prompt = generate_prompt(inputs.input_ids.shape[0]).to(device)
            inputs = inputs.to(device)
            input_ids = torch.cat([prompt.input_ids, inputs.input_ids], dim=1)
            no_loss_ids = torch.ones_like(prompt.input_ids) * -100  # don't compute loss on the prompt tokens
            label_ids = torch.cat([no_loss_ids, inputs.input_ids], dim=1)
            attention_mask = torch.cat([prompt.attention_mask, inputs.attention_mask], dim=1)

            optimizer.zero_grad()

            with torch.autocast(device_type=device.type):  # mixed precision (fp16 on CUDA)
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=label_ids,
                    return_dict=True,
                )
                loss = outputs.loss

            loss.backward()
            optimizer.step()
            scheduler.step()

            # log
            loss_list.append(loss.detach().cpu().item())
            if (local_step % 50 == 0 and local_step != 0) or local_step == kwargs['steps_per_epoch'] - 1:
                avg_loss = sum(loss_list) / len(loss_list)
                n_step_time = time.time() - last_time
                left_time = (kwargs['steps_per_epoch'] - local_step) // 50 * n_step_time
                print("Epoch {}/{} | Step {}/{} | loss:{:.5f} time:{:.1f}s left:{:.1f}m".format(
                    epoch, kwargs['max_epochs'], local_step, kwargs['steps_per_epoch'],
                    avg_loss, n_step_time, left_time / 60
                ))
                last_time = time.time()
            writer.add_scalar('Train/Loss', loss, step)
            writer.add_scalar('Epoch', epoch, step)

            # if local_step % 6000 == 0 and local_step != 0:
            #     evaluate(model, test_loader, epoch * kwargs['steps_per_epoch'] + local_step, **kwargs)
        # torch.save(model.named_parameters(), "checkpoint.pth")
        model.save_pretrained("checkpoint_lora_v2")
        # ==========  Eval  ==========
        evaluate(model, test_loader, epoch, **kwargs)
        print("=" * 53)
        # ==========  Inference  ==========
        inference(model)

# tokenizer("写一首唐诗:", padding="longest", truncation=True, return_tensors='pt')
In [ ]:
# prepare data
train_dataset = PoemDataset("data/train_poems_v2.txt")
test_dataset = PoemDataset("data/test_poems_v2.txt")

# Hyperparameter
batch_size = 16
lr = 1e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_epochs = 2
num_warmup_steps = 200
num_training_steps = max_epochs * math.ceil(len(train_dataset) / batch_size)
seed = 2023
torch.manual_seed(seed)  # fix the seed for reproducibility
logger_name = "logs"

# prepare model
gpt2_model = AutoModelForCausalLM.from_pretrained("IDEA-CCNL/Wenzhong-GPT2-110M")
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16, lora_alpha=16, lora_dropout=0.1, bias='all'
)
gpt2_model = get_peft_model(gpt2_model, peft_config)
gpt2_model.print_trainable_parameters()

# prepare optimizer
optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, gpt2_model.parameters()), lr=lr)
sche = transformers.get_linear_schedule_with_warmup(optim, num_warmup_steps, num_training_steps)

# train
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

train(
    gpt2_model,
    (train_dataloader, val_dataloader),
    optimizer=optim,
    scheduler=sche,
    device=device,
    max_epochs=max_epochs,
    logger_name=logger_name,
    steps_per_epoch=len(train_dataloader)
)
trainable params: 691,968 || all params: 125,065,728 || trainable%: 0.5532834702725274
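
The LoRA numbers printed above can be reproduced by hand; a minimal check over the same peft-wrapped model:

In [ ]:
# Should match print_trainable_parameters(): only the injected LoRA (and bias)
# tensors keep requires_grad=True after get_peft_model().
trainable = sum(p.numel() for p in gpt2_model.parameters() if p.requires_grad)
total = sum(p.numel() for p in gpt2_model.parameters())
print(f"trainable: {trainable:,} / {total:,} = {100 * trainable / total:.4f}%")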
In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
from peft import PeftModel
import torch
import os
import re

class ChineseCharacterStop(StoppingCriteria):
    def __init__(self, chars: list[str]):
        self.chars = [
            tokenizer(i, add_special_tokens=False, return_tensors='pt').input_ids
            for i in chars
        ]
        # for chars, tokens in zip(chars, self.chars):
        #     print(f"'{chars}':{tokens}")

    def __call__(self, input_ids: torch.LongTensor,
                 scores: torch.FloatTensor, **kwargs) -> bool:
        for c in self.chars:
            c = c.to(input_ids.device)
            match = torch.eq(input_ids[..., -c.shape[1]:], c)
            if torch.any(torch.all(match, dim=1)):
                return True
        return False
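
# --- Toy check of the suffix match above (dummy token ids, not the real vocab):
# the last k generated ids are compared to each stop string's k token ids, and
# generation halts as soon as ANY sequence in the batch ends with a stop string.
_ids = torch.tensor([[5, 9, 42], [7, 7, 7]])  # fake batch of generated sequences
_stop = torch.tensor([[9, 42]])               # fake token ids of one stop string
assert torch.any(torch.all(torch.eq(_ids[..., -_stop.shape[1]:], _stop), dim=1))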


tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Wenzhong-GPT2-110M")
tokenizer.pad_token = tokenizer.eos_token
gpt2_model = AutoModelForCausalLM.from_pretrained("IDEA-CCNL/Wenzhong-GPT2-110M")
model = PeftModel.from_pretrained(gpt2_model, 'checkpoint_lora_v4.1')


def cang_tou(tou: str):
    """Acrostic poem: append each head character, then let the model complete the clause."""
    poem_now = "写一首唐诗:"
    for c in tou:
        poem_now += c
        print(poem_now)
        inputs = tokenizer(poem_now, return_tensors='pt')
        outputs = model.generate(
            **inputs,
            return_dict_in_generate=True,
            max_length=150,
            do_sample=True,
            top_p=0.4,
            num_beams=1,
            num_return_sequences=1,
            stopping_criteria=[ChineseCharacterStop(['。', ','])],
            pad_token_id=tokenizer.pad_token_id
        )
        poem_now = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)[0]
    print(poem_now)
    return poem_now[6:]  # strip the 6-character prompt "写一首唐诗:"


def prompt_gen(prompt):
    inputs = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(
        **inputs,
        return_dict_in_generate=True,
        max_length=200,
        do_sample=True,
        top_p=0.8,
        num_beams=5,
        num_return_sequences=3,
        # stopping_criteria=[ChineseCharacterStop(['。', ',', ''])],
        pad_token_id=tokenizer.pad_token_id
    )
    res = ''
    for line in tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True):
        line = line[len(prompt):]  # drop the echoed prompt
        res = res + line + '\n'
    return res

def handle(conf):
    prompt = conf["prompt"]
    if conf['mode'] == "藏头诗":  # acrostic poem
        res = cang_tou(prompt)
    elif conf['mode'] == "根据提示生成古诗":  # free generation from a prompt
        res = prompt_gen(prompt)
    else:
        res = ""
    return {"result": res}

res = cang_tou("一见如故")
print(res)
写一首唐诗:一
写一首唐诗:一江春水碧,见
写一首唐诗:一江春水碧,见我岸边白。如
写一首唐诗:一江春水碧,见我岸边白。如今长绿碧,故
写一首唐诗:一江春水碧,见我岸边白。如今长绿碧,故园有香火。
一江春水碧,见我岸边白。如今长绿碧,故园有香火。
In [1]:
from gradio_client import Client

client = Client("https://wendyy-poem-generate.hf.space/")

def handle(conf):
    # fn_index=1 is the acrostic ('藏头诗') endpoint, fn_index=0 the free-prompt
    # endpoint; the demo below confirms fn_index=1 returns an acrostic.
    prompt = conf["prompt"]
    if conf['mode'] == "藏头诗":  # acrostic poem
        result = client.predict(
            prompt,  # str in the '藏头诗' Textbox component
            fn_index=1
        )
    elif conf['mode'] == "根据提示生成古诗":  # free generation from a prompt
        result = client.predict(
            prompt,  # str in the 'Prompt' Textbox component
            fn_index=0
        )
    else:
        result = ""
    return {"result": result}
Loaded as API: https://wendyy-poem-generate.hf.space/ ✔
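
A quick end-to-end check of handle (the config dict shape is an assumption; the mode strings must match exactly):

In [ ]:
# Hypothetical request dict, routed through the Space's acrostic endpoint:
print(handle({"mode": "藏头诗", "prompt": "一见如故"}))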
In [3]:
result = client.predict(
                "一见如故",	# str in the '藏头诗' Textbox component
                fn_index=1)
In [4]:
result
Out[4]:
'一曲江汉江,见解知音渺。如何细看江,故人何处寻。'
In [ ]: