Stable Diffusion (Diffusers) / Using prompts longer than 77 tokens with Diffusers

By default, any prompt longer than 77 tokens is truncated, and Diffusers prints a warning like the one below. This post explains how to work around that limit.

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens:
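
To check whether a prompt will be truncated, you can count its tokens up front (a minimal sketch, assuming a loaded pipeline pipe and a prompt string prompt are already defined):

token_ids = pipe.tokenizer(prompt).input_ids
# model_max_length is 77 for CLIP; the count includes the BOS/EOS special tokens
print(len(token_ids), pipe.tokenizer.model_max_length)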

Compel

Compel, a library for weighting and blending text prompts, lets you use prompts longer than 77 tokens:
https://github.com/damian0815/compel

Set truncate_long_prompts=False when constructing the Compel object:
from compel import Compel, DiffusersTextualInversionManager

textual_inversion_manager = DiffusersTextualInversionManager(pipe)
compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder,
                textual_inversion_manager=textual_inversion_manager,
                truncate_long_prompts=False,
                device=device)

# Single prompts can also be encoded with build_conditioning_tensor():
# positive_embeds = compel.build_conditioning_tensor(prompt)
# negative_embeds = compel.build_conditioning_tensor(negative_prompt)
positive_embeds = compel(prompt)
negative_embeds = compel(negative_prompt)

# The positive and negative embeddings can end up with different sequence
# lengths, so pad them to the same length before calling the pipeline.
[positive_embeds, negative_embeds] = compel.pad_conditioning_tensors_to_same_length(
    [positive_embeds, negative_embeds])
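
The resulting embeddings go to the pipeline in place of the prompt strings. A minimal sketch (the full ControlNet call used in this post appears at the end; num_inference_steps here is just a placeholder):

image = pipe(prompt_embeds=positive_embeds,
             negative_prompt_embeds=negative_embeds,
             num_inference_steps=30).images[0]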

With truncate_long_prompts=False, Compel splits an over-long prompt into chunks at arbitrary token boundaries. The workaround below, adapted from the discussion in this Compel issue, instead splits the prompt at separators (comma, period, colon) before encoding, the way the AUTOMATIC1111 web UI does, then concatenates the per-chunk embeddings:
https://github.com/damian0815/compel/issues/59

import torch
from compel import Compel, DiffusersTextualInversionManager

def concat_tensor(t):
    # Compel returns one embedding per chunk along the batch dimension;
    # join them along the sequence dimension to get a single long embedding.
    t_list = torch.split(t, 1, dim=0)
    t = torch.cat(t_list, dim=1)
    return t
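
# For example, with the SD 1.x text encoder, three encoded chunks stacked
# as a (3, 77, 768) batch become one (1, 231, 768) tensor:
#   concat_tensor(torch.randn(3, 77, 768)).shape  # -> torch.Size([1, 231, 768])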


def detokenize(chunk, actual_prompt):
    # Rebuild the original text covered by a chunk of tokens and strip it
    # from the front of actual_prompt. '</w>' is CLIP's end-of-word marker;
    # it becomes a space only where the original prompt actually had one.
    chunk[-1] = chunk[-1].replace('</w>', '')
    chunked_prompt = ''.join(chunk).strip()
    while '</w>' in chunked_prompt:
        if actual_prompt[chunked_prompt.find('</w>')] == ' ':
            chunked_prompt = chunked_prompt.replace('</w>', ' ', 1)
        else:
            chunked_prompt = chunked_prompt.replace('</w>', '', 1)
    actual_prompt = actual_prompt.replace(chunked_prompt, '')
    return chunked_prompt.strip(), actual_prompt.strip()

def tokenize_line(line, tokenizer):
    # Split a prompt into chunks of at most model_max_length - 2 tokens
    # (two slots are reserved for the BOS/EOS special tokens), preferring
    # to break at a comma, period, or colon.
    actual_prompt = line.lower().strip()
    actual_tokens = tokenizer.tokenize(actual_prompt)
    max_tokens = tokenizer.model_max_length - 2
    separators = {
        'comma': tokenizer.tokenize(',')[0],
        'dot': tokenizer.tokenize('.')[0],
        'colon': tokenizer.tokenize(':')[0]
    }

    chunks = []
    chunk = []
    for item in actual_tokens:
        chunk.append(item)
        if len(chunk) == max_tokens:
            if chunk[-1] not in list(separators.values()):
                # Backtrack to the last separator inside the chunk, if any.
                for i in range(max_tokens - 1, -1, -1):
                    if chunk[i] in list(separators.values()):
                        actual_chunk, actual_prompt = detokenize(chunk[:i + 1], actual_prompt)
                        chunks.append(actual_chunk)
                        chunk = chunk[i + 1:]
                        break
                else:
                    # No separator found: split at the hard token limit.
                    actual_chunk, actual_prompt = detokenize(chunk, actual_prompt)
                    chunks.append(actual_chunk)
                    chunk = []
            else:
                actual_chunk, actual_prompt = detokenize(chunk, actual_prompt)
                chunks.append(actual_chunk)
                chunk = []
    if chunk:
        actual_chunk, _ = detokenize(chunk, actual_prompt)
        chunks.append(actual_chunk)

    return chunks
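
# Hypothetical illustration (the prompt text is made up): a prompt well
# over 77 tokens is split into separator-aligned chunks of <= 75 tokens.
long_prompt = ("a watercolor painting of an old lighthouse on a rocky coast, "
               "storm clouds gathering overhead, waves crashing against the rocks, "
               "seagulls circling in the wind, muted blues and greys, "
               "thick brush strokes, visible paper texture, soft diffuse light, "
               "high level of detail in the foreground, hazy distant horizon")
print(tokenize_line(long_prompt, pipe.tokenizer))  # e.g. two chunks, split at a comma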

textual_inversion_manager = DiffusersTextualInversionManager(pipe)
compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder,
                textual_inversion_manager=textual_inversion_manager,
                truncate_long_prompts=False,
                device=device)

# Encode each chunk separately, merge the per-chunk embeddings into one
# long embedding, and pad positive/negative to the same sequence length.
positive_embeds = compel(tokenize_line(prompt, pipe.tokenizer))
negative_embeds = compel(tokenize_line(negative_prompt, pipe.tokenizer))
[positive_embeds, negative_embeds] = compel.pad_conditioning_tensors_to_same_length(
    [concat_tensor(positive_embeds), concat_tensor(negative_embeds)])
Then pass the embeddings to the pipeline through prompt_embeds and negative_prompt_embeds instead of prompt and negative_prompt:

image = pipe(
    prompt_embeds=positive_embeds,
    negative_prompt_embeds=negative_embeds,
    # prompt=prompt,
    # negative_prompt=negative_prompt,
    image=init_img,
    width=width, height=height, generator=generator,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    guidance_scale=CFG_scale, num_inference_steps=Steps,
    guess_mode=guess_mode
).images[0]

stable-diffusion

Posted by eightban