Transformers

RinnaStableDiffusion

Rinnaから日本語対応のStable Diffusionが出たので、Google Colab上で使ってみました。コードとしては以下のような形です。bashのコードはJupyterから投げます。 pip install gradio try: from japanese_stable_diffusion import JapaneseStableDiffusionPipeline except: res = subprocess.run(['pip', 'install', 'git+https://github.com/rinnakk/japanese-stable-diffusion'], stdout=subprocess.PIPE).stdout.decode('utf-8') print(res) from japanese_stable_diffusion import JapaneseStableDiffusionPipeline import torch from torch import autocast from diffusers import LMSDiscreteScheduler from PIL import Image from IPython import display import gradio as gr def make_grid_from_pils(pil_images): w, h = pil_images[0].size grid_img = Image.new("RGB", ((len(pil_images)) * w, h)) for idx, image in enumerate(pil_images): grid_img.paste(image, (idx * w, 0)) return grid_img from huggingface_hub import notebook_login notebook_login() model_id = "rinna/japanese-stable-diffusion" device = "cuda" if torch.cuda.is_available() else "cpu" # Use the K-LMS scheduler here instead scheduler = LMSDiscreteScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000 ) pipe = JapaneseStableDiffusionPipeline.from_pretrained( pretrained_model_name_or_path=model_id, scheduler=scheduler, torch_dtype=torch.float16, use_auth_token=True ).to(device) #@markdown ###**Inference Setting:** # the number of output images. If you encounter Out Of Memory error, decrease this number. n_samples = 1 #@param{type: 'integer'} # `classifier-free guidance scale` adjusts how much the image will be like your prompt. Higher values keep your image closer to your prompt. guidance_scale = 7.5 #@param {type:"number"} # How many steps to spend generating (diffusing) your image. steps = 50 #@param{type: 'integer'} # The width of the generated image. width = 512 #@param{type: 'integer'} # The height of the generated image. height = 512 #@param{type: 'integer'} # The seed used to generate your image. Enable to manually set a seed. 
seed = 'random' #@param{type: 'string'} import torch from torch import autocast from diffusers import LMSDiscreteScheduler from japanese_stable_diffusion import JapaneseStableDiffusionPipeline model_id = "rinna/japanese-stable-diffusion" device = "cuda" # Use the K-LMS scheduler here instead scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) pipe = JapaneseStableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, use_auth_token=True) pipe = pipe.to(device) prompt = "富士山をバックに二大スーパーロボットががっちりと握手" with autocast("cuda"): image = pipe(prompt, guidance_scale=7.5)["sample"][0] image.save("output.png") image 「富士山をバックに二大スーパーロボットががっちりと握手」から画像を作成すると、以下のような画像になります。 ...

StableDiffusion

Stable DiffusionをGoogle Colab上で使ってみました。コードとしては以下のような形です。bashのコードはJupyterから投げます。 pip install diffusers==0.2.4 pip install transformers scipy ftfy pip install "ipywidgets>=7,<8" from google.colab import output output.enable_custom_widget_manager() from huggingface_hub import notebook_login notebook_login() import torch from diffusers import StableDiffusionPipeline # make sure you're logged in with `huggingface-cli login` pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, use_auth_token=True) pipe = pipe.to("cuda") from torch import autocast prompt = "a galaxy far from earth" with autocast("cuda"): image = pipe(prompt)["sample"][0] # image here is in [PIL format](https://pillow.readthedocs.io/en/stable/) # Now to display an image you can do either save it such as: image.save(f"galaxy_far_from_earth.png") # or if you're in a google colab you can directly display it with image 「a galaxy far from earth」から画像を作成すると、以下のような画像になります。 ...

Transformersによる文書の分類

Hugging Face Transformersを使ってネガポジを判定するモデルを作ってみました。 query title label negaposi この映画は本当に面白い 0 のような形で教師データを作り、それを投入して学習させました。東北大学の日本語 BERT モデルを事前学習モデルとし、それを用いてSequence Classification(文書分類)を行いました。モデリング自体は、Google Colaboratoryを用いて実行しました。学習 !pip install transformers[ja]==4.3.3 torch==1.9 sentencepiece==0.1.91 from google.colab import drive import pandas as pd from sklearn.model_selection import train_test_split from transformers import BertJapaneseTokenizer, BertForSequenceClassification, BertForMaskedLM, pipeline, Trainer, TrainingArguments import torch drive.mount('/content/drive') training_data = pd.read_csv('/content/drive/MyDrive/Texts/negaposi-sentence.csv') training_data.head() print(len(training_data["query"].unique())) training_data[["title", "label"]].groupby("label").count() train_queries, val_queries, train_docs, val_docs, train_labels, val_labels = train_test_split( training_data["query"].tolist(), training_data["title"].tolist(), training_data["label"].tolist(), test_size=.5 ) model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking' tokenizer = BertJapaneseTokenizer.from_pretrained(model_name) train_encodings = tokenizer(train_queries, train_docs, truncation=True, padding='max_length', max_length=128) val_encodings = tokenizer(val_queries, val_docs, truncation=True, padding='max_length', max_length=128) model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking' tokenizer = BertJapaneseTokenizer.from_pretrained(model_name) train_encodings = tokenizer(train_queries, train_docs, truncation=True, padding='max_length', max_length=128) val_encodings = tokenizer(val_queries, val_docs, truncation=True, padding='max_length', max_length=128) model = BertForSequenceClassification.from_pretrained(model_name, num_labels = 2) for param in model.base_model.parameters(): param.requires_grad = False training_args = TrainingArguments( logging_steps=10, output_dir='models', evaluation_strategy="epoch", num_train_epochs=2000, per_device_train_batch_size=16, 
per_device_eval_batch_size=64, warmup_steps=500, weight_decay=0.01, save_total_limit=1, ) trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset ) trainer.train() trainer.save_model(output_dir='/content/drive/MyDrive/Models/sentiment-mining4') 推論 !pip install transformers[ja]==4.3.3 torch==1.9 sentencepiece==0.1.91 from google.colab import drive import pandas as pd from sklearn.model_selection import train_test_split from transformers import BertJapaneseTokenizer, BertForSequenceClassification, BertForMaskedLM, pipeline, Trainer, TrainingArguments import torch drive.mount('/content/drive') model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking' tokenizer = BertJapaneseTokenizer.from_pretrained(model_name) model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Models/sentiment-mining4') nlp = pipeline("sentiment-analysis",model=model,tokenizer=tokenizer) nlp("この本は興味深い") nlp(“この本は興味深い”) ...