Stable Diffusion (Diffusers) / Using Prompts Longer Than 77 Tokens with Diffusers
Prompts longer than 77 tokens are normally truncated. This article explains how to work around that limit.
When a prompt is too long, Diffusers prints a warning such as:
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens:
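To check whether a prompt will be truncated, you can count its tokens with the pipeline's CLIP tokenizer. A minimal sketch, assuming a loaded pipeline pipe and a placeholder prompt:

# Count the CLIP tokens of a prompt (pipe and the prompt string are placeholders)
prompt = "masterpiece, best quality, ..."      # your long prompt here
token_ids = pipe.tokenizer(prompt).input_ids   # includes the BOS/EOS special tokens
print(len(token_ids))                          # more than 77 means the standard call will truncate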
Compel
Compel, a library for weighting and blending text prompts, makes it possible to use prompts longer than 77 tokens.
https://github.com/damian0815/compel
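Compel can be installed from PyPI:
pip install compel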
Set truncate_long_prompts=False:
from compel import Compel, DiffusersTextualInversionManager

# Handles any textual inversion embeddings loaded into the pipeline
textual_inversion_manager = DiffusersTextualInversionManager(pipe)
compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder,
                textual_inversion_manager=textual_inversion_manager,
                truncate_long_prompts=False,
                device=device)

# compel.build_conditioning_tensor(prompt) also works for a single prompt string
positive_embeds = compel([prompt])
negative_embeds = compel([negative_prompt])

# Pad both embeddings to the same token length before passing them to the pipeline
[positive_embeds, negative_embeds] = compel.pad_conditioning_tensors_to_same_length(
    [positive_embeds, negative_embeds])
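The snippet assumes that pipe, device, prompt, and negative_prompt already exist. A minimal sketch of that setup with a plain text-to-image pipeline (the model ID is only an example; the full example later in this article uses a ControlNet pipeline):

import torch
from diffusers import StableDiffusionPipeline

device = "cuda"
# Example checkpoint; any Stable Diffusion 1.x model works the same way
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to(device)

prompt = "a very long, comma separated prompt that easily exceeds the 77 token limit, ..."
negative_prompt = "low quality, blurry, ..."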
The simple approach above can split a long prompt at arbitrary 75-token boundaries, cutting a phrase in the middle. The workaround below, based on the following issue, pre-splits the prompt at commas, periods, and colons before handing the chunks to Compel:
https://github.com/damian0815/compel/issues/59
import torch
from compel import Compel, DiffusersTextualInversionManager

def concat_tensor(t):
    # Flatten the chunk dimension: (chunks, 77, dim) -> (1, chunks * 77, dim)
    t_list = torch.split(t, 1, dim=0)
    t = torch.cat(t_list, dim=1)
    return t
def detokenize(chunk, actual_prompt):
    # Turn a list of CLIP tokens back into text and strip that text from the remaining prompt
    chunk[-1] = chunk[-1].replace('</w>', '')
    chunked_prompt = ''.join(chunk).strip()
    while '</w>' in chunked_prompt:
        if actual_prompt[chunked_prompt.find('</w>')] == ' ':
            chunked_prompt = chunked_prompt.replace('</w>', ' ', 1)
        else:
            chunked_prompt = chunked_prompt.replace('</w>', '', 1)
    actual_prompt = actual_prompt.replace(chunked_prompt, '')
    return chunked_prompt.strip(), actual_prompt.strip()
def tokenize_line(line, tokenizer):
    # Split the prompt into chunks of at most 75 tokens, cutting at the last , . : when possible
    actual_prompt = line.lower().strip()
    actual_tokens = tokenizer.tokenize(actual_prompt)
    max_tokens = tokenizer.model_max_length - 2
    separators = {
        'comma': tokenizer.tokenize(',')[0],
        'dot': tokenizer.tokenize('.')[0],
        'colon': tokenizer.tokenize(':')[0]
    }
    chunks = []
    chunk = []
    for item in actual_tokens:
        chunk.append(item)
        if len(chunk) == max_tokens:
            if chunk[-1] not in list(separators.values()):
                # Search backwards for the last separator and cut the chunk there
                for i in range(max_tokens - 1, -1, -1):
                    if chunk[i] in list(separators.values()):
                        actual_chunk, actual_prompt = detokenize(chunk[:i + 1], actual_prompt)
                        chunks.append(actual_chunk)
                        chunk = chunk[i + 1:]
                        break
                else:
                    # No separator found: cut at the token limit
                    actual_chunk, actual_prompt = detokenize(chunk, actual_prompt)
                    chunks.append(actual_chunk)
                    chunk = []
            else:
                actual_chunk, actual_prompt = detokenize(chunk, actual_prompt)
                chunks.append(actual_chunk)
                chunk = []
    if chunk:
        actual_chunk, _ = detokenize(chunk, actual_prompt)
        chunks.append(actual_chunk)
    return chunks
textual_inversion_manager = DiffusersTextualInversionManager(pipe)
compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder,
                textual_inversion_manager=textual_inversion_manager,
                truncate_long_prompts=False,
                device=device)

# Previously: positive_embeds = compel([prompt]) and negative_embeds = compel([negative_prompt])
# Each chunk is now encoded on its own, then joined along the token dimension
positive_embeds = compel(tokenize_line(prompt, pipe.tokenizer))
negative_embeds = compel(tokenize_line(negative_prompt, pipe.tokenizer))
[positive_embeds, negative_embeds] = compel.pad_conditioning_tensors_to_same_length(
    [concat_tensor(positive_embeds), concat_tensor(negative_embeds)])
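For illustration, you can inspect how a long prompt gets chunked; each chunk stays within 75 tokens and ends at a separator where possible:

# Illustration only: print the chunks and their token counts
for i, c in enumerate(tokenize_line(prompt, pipe.tokenizer)):
    print(i, len(pipe.tokenizer.tokenize(c)), c)

compel() then encodes each chunk into its own 77-token embedding, and concat_tensor joins those embeddings along the token dimension so the pipeline receives a single batch entry.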
Switch the pipeline call from prompt / negative_prompt to prompt_embeds / negative_prompt_embeds:
image = pipe(
    prompt_embeds=positive_embeds,
    negative_prompt_embeds=negative_embeds,
    # prompt=prompt,
    # negative_prompt=negative_prompt,
    image=init_img,
    width=width, height=height, generator=generator,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    guidance_scale=CFG_scale, num_inference_steps=Steps,
    guess_mode=guess_mode
).images[0]
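The call above targets a ControlNet img2img-style pipeline (hence image, controlnet_conditioning_scale, and guess_mode). With a plain text-to-image pipeline, only the embeddings and the usual sampling arguments are needed; a sketch using the same variable names:

# Sketch: the same embeddings with a plain text-to-image pipeline
image = pipe(
    prompt_embeds=positive_embeds,
    negative_prompt_embeds=negative_embeds,
    width=width, height=height, generator=generator,
    guidance_scale=CFG_scale, num_inference_steps=Steps,
).images[0]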