# Quick-start
## Setup
1. Clone the repository:

   ```bash
   git clone https://github.com/Guzpenha/transformer_rankers.git
   cd transformer_rankers
   ```
2. Install the library in a virtual environment:

   ```bash
   python3 -m venv env
   source env/bin/activate
   pip install -r requirements.txt
   ```
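Note that `pip install -r requirements.txt` installs the dependencies; run your scripts from the repository root so that the `transformer_rankers` package resolves on the import path (an assumption about the default checkout layout; adjust if you install the package itself, e.g. with `pip install .`). A minimal sanity check:

```python
# Minimal import check (a sketch): run from the repository root so the
# package directory is importable without a separate package install.
from transformer_rankers.trainers import transformer_trainer

print("transformer_rankers imported successfully")
```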
## Example
Fine-tune pointwise BERT for Community Question Answering. A runnable version of this example is also available on Google Colab.
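The script below downloads and preprocesses the Quora Question Pairs (`qqp`) data, generates random negative samples for training and validation, builds PyTorch dataloaders, fine-tunes `bert-base-cased` as a pointwise classifier for one epoch, and reports nDCG@10 on the validation set (used here in place of a held-out test set).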
```python
from transformers import BertTokenizerFast, BertForSequenceClassification

from transformer_rankers.trainers import transformer_trainer
from transformer_rankers.datasets import dataset, downloader
from transformer_rankers.negative_samplers import negative_sampling
from transformer_rankers.eval import results_analyses_tools

import pandas as pd
import logging

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s",
                    handlers=[logging.StreamHandler()])

task = 'qqp'
data_folder = "./data/"

# Download and preprocess the data for the chosen task.
logging.info("Starting downloader for task {}".format(task))
dataDownloader = downloader.DataDownloader(task, data_folder)
dataDownloader.download_and_preprocess()

train = pd.read_csv(data_folder + task + "/train.tsv", sep="\t")
valid = pd.read_csv(data_folder + task + "/valid.tsv", sep="\t")

# Random negative samplers (1 negative per query). The validation sampler
# draws candidates from both the train and validation questions.
ns_train = negative_sampling.RandomNegativeSampler(list(train["question1"].values), 1)
ns_val = negative_sampling.RandomNegativeSampler(list(valid["question1"].values) +
                                                 list(train["question1"].values), 1)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Create the loaders for the datasets, with the respective negative samplers.
dataloader = dataset.QueryDocumentDataLoader(train_df=train, val_df=valid, test_df=valid,
                                             tokenizer=tokenizer, negative_sampler_train=ns_train,
                                             negative_sampler_val=ns_val, task_type='classification',
                                             train_batch_size=32, val_batch_size=32, max_seq_len=512,
                                             sample_data=-1, cache_path="{}/{}".format(data_folder, task))
train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders()

model = BertForSequenceClassification.from_pretrained('bert-base-cased')

# Instantiate the trainer that handles fitting.
trainer = transformer_trainer.TransformerTrainer(model=model, train_loader=train_loader,
                                                 val_loader=val_loader, test_loader=test_loader,
                                                 num_ns_eval=9, task_type="classification",
                                                 tokenizer=tokenizer,
                                                 validate_every_epochs=1, num_validation_batches=-1,
                                                 num_epochs=1, lr=0.0005, sacred_ex=None,
                                                 validate_every_steps=-1, num_training_instances=1000)

# Train the model.
logging.info("Fitting monoBERT for {}".format(task))
trainer.fit()

# Predict for test (in this example the validation set).
logging.info("Predicting")
preds, labels, _ = trainer.test()
res = results_analyses_tools.evaluate_and_aggregate(preds, labels, ['ndcg_cut_10'])

for metric, v in res.items():
    logging.info("Test {} : {:4f}".format(metric, v))
```
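The quick-start script does not persist the fine-tuned weights. Since `model` is a plain Hugging Face `BertForSequenceClassification`, the standard `save_pretrained`/`from_pretrained` calls can be appended to the script above to reuse the ranker later (a minimal sketch; the checkpoint path is arbitrary):

```python
# Persist the fine-tuned ranker (standard Hugging Face API; path is arbitrary).
output_dir = "./checkpoints/monobert-{}".format(task)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Reload later for inference or further training.
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizerFast.from_pretrained(output_dir)
```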