I tried building a model that classifies Japanese text as negative or positive using Hugging Face Transformers.
| query | title | label |
|---|---|---|
| negaposi | この映画は本当に面白い | 0 |
I prepared labeled training data in this form and fed it into training. I used Tohoku University's Japanese BERT model as the pretrained model and fine-tuned it for sequence classification.
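For illustration, the negaposi-sentence.csv file loaded below is assumed to look roughly like this (the second row is a made-up example of the format, not taken from the actual training data):

query,title,label
negaposi,この映画は本当に面白い,0
negaposi,この映画は退屈だった,1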
The modeling itself was run on Google Colaboratory.
Training
!pip install transformers[ja]==4.3.3 torch==1.9 sentencepiece==0.1.91
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertJapaneseTokenizer, BertForSequenceClassification, BertForMaskedLM, pipeline, Trainer, TrainingArguments
import torch
drive.mount('/content/drive')
training_data = pd.read_csv('/content/drive/MyDrive/Texts/negaposi-sentence.csv')
training_data.head()
print(len(training_data["query"].unique()))
training_data[["title", "label"]].groupby("label").count()
train_queries, val_queries, train_docs, val_docs, train_labels, val_labels = train_test_split(
    training_data["query"].tolist(),
    training_data["title"].tolist(),
    training_data["label"].tolist(),
    test_size=.5
)
model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(train_queries, train_docs, truncation=True, padding='max_length', max_length=128)
val_encodings = tokenizer(val_queries, val_docs, truncation=True, padding='max_length', max_length=128)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Freeze the pretrained BERT encoder so that only the classification head is trained.
for param in model.base_model.parameters():
    param.requires_grad = False
training_args = TrainingArguments(
    logging_steps=10,
    output_dir='models',
    evaluation_strategy="epoch",
    num_train_epochs=2000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=1,
)
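The Trainer below expects train_dataset and val_dataset, which the code above never defines. Here is a minimal sketch of how they can be built from the tokenized encodings and the label lists, following the standard torch Dataset wrapper used in the Transformers fine-tuning docs (the SentimentDataset name is my own, not from the original write-up):

class SentimentDataset(torch.utils.data.Dataset):
    """Wraps the tokenizer output and labels so the Trainer can index into them."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)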
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
trainer.train()
trainer.save_model(output_dir='/content/drive/MyDrive/Models/sentiment-mining4')
Inference
!pip install transformers[ja]==4.3.3 torch==1.9 sentencepiece==0.1.91
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertJapaneseTokenizer, BertForSequenceClassification, BertForMaskedLM, pipeline, Trainer, TrainingArguments
import torch
drive.mount('/content/drive')
model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Models/sentiment-mining4')
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
nlp("この本は興味深い")
nlp("この本は酷い")
[{’label’: ‘LABEL_1’, ‘score’: 0.5989319086074829}]
nlp("この本はつまらない")
[{’label’: ‘LABEL_1’, ‘score’: 0.6405262351036072}]
nlp("この映画やばくね")
[{’label’: ‘LABEL_1’, ‘score’: 0.6612663269042969}]
nlp("この映画とっても面白いんだけど")
[{’label’: ‘LABEL_0’, ‘score’: 0.5494323372840881}]
nlp("この映画とっても面白い")
[{’label’: ‘LABEL_1’, ‘score’: 0.5648174285888672}]
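The LABEL_0 / LABEL_1 names in these outputs are just the generic names the pipeline gives to label ids 0 and 1 from the training CSV. If you prefer readable names, you can set id2label / label2id on the model config before building the pipeline. The positive/negative mapping below is an assumption inferred from the sample row in the table above (a positive-sounding sentence labeled 0), not something stated explicitly:

# Assumed mapping: 0 = positive, 1 = negative (inferred from the training data sample above).
model.config.id2label = {0: "positive", 1: "negative"}
model.config.label2id = {"positive": 0, "negative": 1}

nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
nlp("この本は酷い")  # now reports 'negative' instead of 'LABEL_1'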