As proposed in our paper, the "Words of a Thousand Pictures" metric (W1KP) measures perceptual variability for sets of images in text-to-image generation, bootstrapped from existing perceptual distances such as DreamSim.
-
Install PyTorch for your Python 3.10+ environment.
-
Install W1KP:
pip install w1kp
-
Download the calibration data file.
-
You're done!
We recommend the following usage:
import asyncio
import torch
from w1kp import StableDiffusionXLImageGenerator, DreamSimDistanceMeasure, query_inverted_cdf
async def amain():
    """Generate ten SDXL images for a prompt and print their normalized W1KP score."""
    prompt = 'cat'
    images = []
    image_gen = StableDiffusionXLImageGenerator()

    # Generate 10 SDXL images for the prompt, one per seed.
    for seed in range(10):
        ret = await image_gen.generate_image(prompt, seed=seed)
        images.append(ret['image'])

    # Compute and normalize the W1KP score.
    dreamsim_l2 = DreamSimDistanceMeasure().to_listwise()
    cdf_x, cdf_y = torch.load('cdf-xy.pt')  # download this data file from the repo
    dist = dreamsim_l2.measure(prompt, images)
    dist = query_inverted_cdf(cdf_x, cdf_y, dist)  # normalize to U[0, 1]
    w1kp_score = 1 - dist  # invert for the W1KP score

    # Display each generated image for visual inspection.
    for im in images:
        im.show()

    print(f'The W1KP score for the images is {w1kp_score}')


if __name__ == '__main__':
    asyncio.run(amain())
@article{tang2024w1kp,
title={Words Worth a Thousand Pictures: Measuring and Understanding Perceptual Variability in Text-to-Image Generation},
author={Tang, Raphael and Zhang, Xinyu and Xu, Lixinyu and Lu, Yao and Li, Wenyan and Stenetorp, Pontus and Lin, Jimmy and Ture, Ferhan},
journal={arXiv:2406.08482},
year={2024}
}