Add support for AI Image gen

alamshafil · Oct 11, 2024 · eadee2b · eadee2b
1 parent 463d27a
commit eadee2b
Show file tree

Hide file tree

Showing 13 changed files with 322 additions and 90 deletions.
diff --git a/README.md b/README.md
@@ -186,7 +186,7 @@ npx auto-shorts --download
 # You have options to use different AI tools to generate the script, voice, and image
 
 # Use OpenAI gpt-4o-mini to generate the script, ElevenLabs to generate the voice, and Pexels to generate the image
-npx auto-shorts -p "make a news short about TypeScript" --aiType OpenAIGen --ttsType ElevenLabs --imageType PexelsImageGen --elevenLabsAPIKey YOUR_ELEVENLABS_API_KEY --pexelsAPIKey YOUR_PEXELS_API_KEY --openaiAPIKey YOUR_OPENAI_API_KEY
+npx auto-shorts -p "make a news short about TypeScript" --aiType OpenAIGen --ttsType ElevenLabs --imageType Pexels --elevenLabsAPIKey YOUR_ELEVENLABS_API_KEY --pexelsAPIKey YOUR_PEXELS_API_KEY --openaiAPIKey YOUR_OPENAI_API_KEY
 
 # Use local Ollama llama3.2 to generate the script, Built-in TTS to generate the voice, and Google Scraper to generate the image (default, no need to provide API keys)
 npx auto-shorts -p "make a news short about TypeScript"
@@ -229,7 +229,7 @@ const task = await genVideoWithAI(
         tempPath: 'video_temp', // Provide the path to the temporary video folder
         resPath: 'res', // Provide the path to the downloaded resources folder
         voiceGenType: VoiceGenType.ElevenLabsVoice, // Use ElevenLabs to generate the voice
-        imageGenType: ImageGenType.PexelsImageGen, // Use Pexels to generate the image
+        imageGenType: ImageGenType.Pexels, // Use Pexels to generate the image
         apiKeys: {
             elevenLabsAPIKey: process.env.ELEVENLABS_API_KEY, // Provide the ElevenLabs API key
             pexelsAPIKey: process.env.PEXELS_API_KEY, // Provide the Pexels API key
@@ -264,7 +264,7 @@ const task = await genVideoWithJson(
     tempPath: 'video_temp', // Provide the path to the temporary video folder
     resPath: 'res', // Provide the path to the downloaded resources folder
     voiceGenType: VoiceGenType.ElevenLabsVoice, // Use ElevenLabs to generate the voice
-    imageGenType: ImageGenType.PexelsImageGen, // Use Google Scraper to generate the image
+    imageGenType: ImageGenType.Pexels, // Use Pexels to generate the image
     apiKeys: {
         elevenLabsAPIKey: process.env.ELEVENLABS_API_KEY, // Provide the ElevenLabs API key
         pexelsAPIKey: process.env.PEXELS_API_KEY, // Provide the Pexels API key

diff --git a/docs/docs/config.md b/docs/docs/config.md
@@ -26,7 +26,7 @@ npx auto-shorts --help
 
 - `--ttsType [type]`: The TTS provider to use. Can be ElevenLabs, BuiltinTTS, NeetsTTS.
 
-- `--imageType [type]`: The image provider to use. Can be PexelsImageGen, GoogleScraperImageGen.
+- `--imageType [type]`: The image provider to use. Can be Pexels, GoogleScraper, FluxAI.
 
 - `--orientation [orientation]`: The orientation of the video. Options are vertical or horizontal.
 

diff --git a/docs/docs/installation/install-cli.md b/docs/docs/installation/install-cli.md
@@ -29,7 +29,7 @@ You have options to use different AI tools to generate the script, voice, and im
 - Use OpenAI gpt-4o-mini to generate the script, ElevenLabs to generate the voice, and Pexels to generate the image
 
 ```bash
-npx auto-shorts -p "make a news short about TypeScript" --aiType OpenAIGen --ttsType ElevenLabs --imageType PexelsImageGen --elevenLabsAPIKey YOUR_ELEVENLABS_API_KEY --pexelsAPIKey YOUR_PEXELS_API_KEY --openaiAPIKey YOUR_OPENAI_API_KEY
+npx auto-shorts -p "make a news short about TypeScript" --aiType OpenAIGen --ttsType ElevenLabs --imageType Pexels --elevenLabsAPIKey YOUR_ELEVENLABS_API_KEY --pexelsAPIKey YOUR_PEXELS_API_KEY --openaiAPIKey YOUR_OPENAI_API_KEY
 ```
 
 - Use local Ollama llama3.2 to generate the script, Built-in TTS to generate the voice, and Google Scraper to generate the image (default, no need to provide API keys)
@@ -64,8 +64,8 @@ Options
                               GoogleAIGen, AnthropicAIGen, OllamaAIGen.
   --ttsType type              The TTS provider to use. Can be ElevenLabs,       
                               BuiltinTTS, NeetsTTS.
-  --imageType type            The image provider to use. Can be PexelsImageGen, 
-                              GoogleScraperImageGen.
+  --imageType type            The image provider to use. Can be Pexels,
+                              GoogleScraper, FluxAI.
   --orientation orientation   The orientation of the video. (vertical,
                               horizontal)
   --tempPath path             The temporary path to save video files. (default: 
@@ -76,6 +76,12 @@ Options
                               Overrides AI.
   -h, --help                  Print this usage guide.
 
+Image Options
+
+  --imgAIModel model     AI model to use for image generation. If applicable.
+  --imgAIPrompt prompt   AI suffix prompt to use for image generation. If       
+                         applicable.
+
 Subtitle Options
 
   --subtitleLen number      Subtitle token length override.

diff --git a/docs/docs/installation/install-js.md b/docs/docs/installation/install-js.md
@@ -31,7 +31,7 @@ const task = await genVideoWithAI(
         tempPath: 'video_temp', // Provide the path to the temporary video folder
         resPath: 'res', // Provide the path to the downloaded resources folder
         voiceGenType: VoiceGenType.ElevenLabsVoice, // Use ElevenLabs to generate the voice
-        imageGenType: ImageGenType.PexelsImageGen, // Use Pexels to generate the image
+        imageGenType: ImageGenType.Pexels, // Use Pexels to generate the image
         apiKeys: {
             elevenLabsAPIKey: process.env.ELEVENLABS_API_KEY, // Provide the ElevenLabs API key
             pexelsAPIKey: process.env.PEXELS_API_KEY, // Provide the Pexels API key
@@ -65,7 +65,7 @@ const task = await genVideoWithJson(
     tempPath: 'video_temp', // Provide the path to the temporary video folder
     resPath: 'res', // Provide the path to the downloaded resources folder
     voiceGenType: VoiceGenType.ElevenLabsVoice, // Use ElevenLabs to generate the voice
-    imageGenType: ImageGenType.PexelsImageGen, // Use Google Scraper to generate the image
+    imageGenType: ImageGenType.Pexels, // Use Pexels to generate the image
     apiKeys: {
         elevenLabsAPIKey: process.env.ELEVENLABS_API_KEY, // Provide the ElevenLabs API key
         pexelsAPIKey: process.env.PEXELS_API_KEY, // Provide the Pexels API key

diff --git a/docs/static/img/ui.png b/docs/static/img/ui.png
diff --git a/src/cli.ts b/src/cli.ts
@@ -185,6 +185,19 @@ async function cli() {
         }
     ];
 
+    const imgOptions = [
+        {
+            name: 'imgAIModel',
+            typeLabel: '{underline model}',
+            description: 'AI model to use for image generation. {italic If applicable.}'
+        },
+        {
+            name: 'imgAIPrompt',
+            typeLabel: '{underline prompt}',
+            description: 'AI suffix prompt to use for image generation. {italic If applicable.}'
+        }
+    ]
+
     const subOptions = [
         {
             name: 'subtitleLen',
@@ -227,6 +240,10 @@ async function cli() {
             header: 'Options',
             optionList: mainOptions
         },
+        {
+            header: 'Image Options',
+            optionList: imgOptions
+        },
         {
             header: 'Subtitle Options',
             optionList: subOptions
@@ -271,7 +288,7 @@ async function cli() {
 
     let aiType: string = options.aiType ?? AIGenType.OllamaAIGen;
     let ttsType: string = options.ttsType ?? VoiceGenType.BuiltinTTS;
-    let imageType: string = options.imageType ?? ImageGenType.GoogleScraperImageGen;
+    let imageType: string = options.imageType ?? ImageGenType.GoogleScraper;
 
     // Check if type is valid
     if (!(aiType in AIGenType)) {
@@ -302,6 +319,10 @@ async function cli() {
     let noBgVideo = options.noBgVideo ?? false;
     let noBgMusic = options.noBgMusic ?? false;
 
+    // AI Image options
+    let imgAIModel = options.imgAIModel ?? null;
+    let imgAIPrompt = options.imgAIPrompt ?? null;
+
     // Subtitle options
     let subtitleLen = options.subtitleLen ?? null;
     let subFontName = options.subFontName ?? null;
@@ -357,6 +378,10 @@ async function cli() {
     console.info("Res path: " + resPath);
     console.info("Prompt: " + (userPrompt ?? "None (will be asked later)"));
 
+    console.log("\n--> Image AI options:");
+    console.info("Image AI model: " + (imgAIModel ?? "Default"));
+    console.info("Image AI prompt: " + (imgAIPrompt ?? "Default"));
+
     console.log("\n--> Subtitle options:");
     console.info("Subtitle length: " + (subtitleLen ?? "Default"));
     console.info("Subtitle font name: " + (subFontName ?? "Default"));
@@ -393,7 +418,7 @@ async function cli() {
         return;
     }
 
-    if (imageType == ImageGenType.PexelsImageGen && !pexelsAPIKey) {
+    if (imageType == ImageGenType.Pexels && !pexelsAPIKey) { // only the Pexels provider needs this key
         console.error("Error: Pexels API key not found. Exiting...");
         return;
     }
@@ -473,6 +498,11 @@ async function cli() {
                 console.info("Use background music: " + !jsonData.noBgMusic);
                 console.info("AI Model: " + jsonData.model);
 
+                // Log previous image options
+                console.info("--> Using previous image options:");
+                console.info("Image AI model: " + (jsonData.imgAIModel ?? "Default"));
+                console.info("Image AI prompt: " + (jsonData.imgAIPrompt ?? "Default"));
+
                 // Log previous subtitle options
                 console.info("--> Using previous subtitles options:");
                 console.info("Subtitle length: " + (jsonData.subtitleLen ?? "Default"));
@@ -496,6 +526,10 @@ async function cli() {
                 noBgMusic = jsonData.noBgMusic;
                 aiModel = jsonData.model;
 
+                // Set previous image options
+                imgAIModel = jsonData.imgAIModel;
+                imgAIPrompt = jsonData.imgAIPrompt;
+
                 // Set previous subtitle options
                 subtitleLen = jsonData.subtitleLen;
                 subFontName = jsonData.subFontName;
@@ -595,6 +629,15 @@ async function cli() {
             }
         }
 
+        // Ask for image AI options
+        console.info("[*] Asking for image AI options (leave empty for default):");
+
+        const imgAIModelRep = await input({ message: `Image AI model? -> ` });
+        const imgAIPromptRep = await input({ message: `Image AI prompt? -> ` });
+
+        imgAIModel = imgAIModelRep !== "" ? imgAIModelRep : imgAIModel;
+        imgAIPrompt = imgAIPromptRep !== "" ? imgAIPromptRep : imgAIPrompt;
+
         // Ask for subtitle options (if empty then keep null or current value)
         console.info("[*] Asking for subtitle options (leave empty for default):");
 
@@ -627,6 +670,11 @@ async function cli() {
         console.info("Use background music: " + !noBgMusic);
         console.info("AI Model: " + aiModel);
 
+        // Print image AI options
+        console.info("--> Image AI options:");
+        console.info("Image AI model: " + (imgAIModel ?? "Default"));
+        console.info("Image AI prompt: " + (imgAIPrompt ?? "Default"));
+
         // Print subtitle options
         console.info("--> Subtitle options:");
         console.info("Subtitle length: " + (subtitleLen ?? "Default"));
@@ -637,6 +685,7 @@ async function cli() {
 
         // Save to file
         const data = {
+            // Main options
             aiType: aiType,
             ttsType: ttsType,
             imageType: imageType,
@@ -649,6 +698,10 @@ async function cli() {
             noBgVideo: noBgVideo,
             noBgMusic: noBgMusic,
             aiModel: aiModel,
+            // Image options
+            imgAIModel: imgAIModel,
+            imgAIPrompt: imgAIPrompt,
+            // Subtitle options
             subtitleLen: subtitleLen,
             subFontName: subFontName,
             subFontSize: subFontSize,
@@ -706,6 +759,9 @@ async function cli() {
             maxLen: subtitleLen, fontName: subFontName, fontSize: subFontSize,
             fontColor: subFontColor, strokeColor: subStrokeColor, strokeWidth: subStrokeWidth
         },
+        imageOptions: {
+            modelName: imgAIModel, suffixPrompt: imgAIPrompt
+        },
         internalOptions: {
             debug: true,
             changePhotos: changePhotos, disableTTS: disableTTS, useMock: useMock, disableSubtitles: disableSubtitles

diff --git a/src/image.ts b/src/image.ts
@@ -11,8 +11,9 @@ import path from "path";
  * Image generation types
  */
 export enum ImageGenType {
-    PexelsImageGen = "PexelsImageGen",
-    GoogleScraperImageGen = "GoogleScraperImageGen",
+    Pexels = "Pexels",
+    GoogleScraper = "GoogleScraper",
+    FluxAI = "FluxAI",
 }
 
 /**
@@ -22,12 +23,35 @@ export enum ImageAPIEnv {
     PexelsAPIKey = "PEXELS_API_KEY",
 }
 
+/**
+ * Image style types
+ */
+export enum ImageStyleType {
+    /** AI generated images */
+    AI = "AI",
+    /** Real images */
+    Search = "Search",
+}
+
+/**
+ * Options for AI image generation
+ */
+export interface AIImageGenOptions {
+    /** Model name */
+    modelName: string;
+    /** Additional prompt */
+    suffixPrompt: string;
+}
+
 /**
  * Base class for image generation
  * @abstract
  */
 export class ImageGen {
-    static async generateImages(gen: VideoGen, images: string[], tempPath: string, changePhotos: boolean) : Promise<string[]> {
+    /** Image style type; static so it is readable from the class itself, matching the subclasses' static overrides */
+    public static styleType: ImageStyleType = ImageStyleType.Search;
+
+    static async generateImages(gen: VideoGen, images: string[], tempPath: string, changePhotos: boolean): Promise<string[]> {
         throw new Error("Method 'generateImage' must be implemented");
     }
 }
@@ -36,6 +60,8 @@ export class ImageGen {
  * Image generation using Pexels API
  */
 export class PexelsImageGen extends ImageGen {
+    /** Image style type */
+    public static styleType: ImageStyleType = ImageStyleType.Search;
 
     /**
      * Generate images using Pexels API
@@ -46,40 +72,40 @@ export class PexelsImageGen extends ImageGen {
      * @param filePrefix - File prefix for images
      * @returns List of image paths
      */
-    static async generateImages(gen: VideoGen, images: string[], tempPath: string, changePhotos: boolean, apiKey?: string, filePrefix?: string) : Promise<string[]> {
+    static async generateImages(gen: VideoGen, images: string[], tempPath: string, changePhotos: boolean, apiKey?: string, filePrefix?: string): Promise<string[]> {
         if (!apiKey) {
             throw new Error("Pexels API key required");
         }
 
         const client = new Client({ apiKey: apiKey });
         const imgs: string[] = [];
-    
+
         for (const [index, _] of images.entries()) {
             if (changePhotos) {
                 const query = images[index];
                 gen.log(`Searching for images for rank ${index + 1} with query: ${query}`);
-    
+
                 const r_images_rep = await client.v1.photos.search(query, { perPage: 1, page: 1 });
                 const r_image1 = r_images_rep.photos[0].src.large;
-    
+
                 // Download images with axios
                 const r_image_path = path.join(tempPath, `image-${filePrefix ?? index}.png`);
-    
+
                 const image_response = await axios.get(r_image1, { responseType: 'arraybuffer' });
                 fs.writeFileSync(r_image_path, image_response.data);
-    
+
                 imgs.push(r_image_path);
-    
+
                 gen.log(`Image for rank ${index + 1} downloaded successfully at ${r_image_path}`);
             } else {
                 const r_image_path = path.join(tempPath, `image-${filePrefix ?? index}.png`);
-    
+
                 imgs.push(r_image_path);
-        
+
                 gen.log(`Image for rank ${index + 1} downloaded successfully at ${r_image_path}`);
             }
         }
-        
+
         return imgs;
     }
 }
@@ -88,6 +114,8 @@ export class PexelsImageGen extends ImageGen {
  * Image generation using Google
  */
 export class GoogleScraperImageGen extends ImageGen {
+    /** Image style type */
+    public static styleType: ImageStyleType = ImageStyleType.Search;
 
     /**
      * Generate images using Google
@@ -97,7 +125,7 @@ export class GoogleScraperImageGen extends ImageGen {
      * @param filePrefix - File prefix for images
      * @returns List of image paths
      */
-    static async generateImages(gen: VideoGen, images: string[], tempPath: string, changePhotos: boolean, filePrefix?: string) : Promise<string[]> {
+    static async generateImages(gen: VideoGen, images: string[], tempPath: string, changePhotos: boolean, filePrefix?: string): Promise<string[]> {
         const imgs: string[] = [];
 
         if (changePhotos) {
@@ -122,7 +150,7 @@ export class GoogleScraperImageGen extends ImageGen {
 
         return imgs;
     }
-    
+
     private static async imgScrape(queries: string[]) {
         try {
             const browser = await puppeteer.launch({ headless: true });
@@ -166,3 +194,25 @@ export class GoogleScraperImageGen extends ImageGen {
         }
     }
 }
+
+/**
+ * Image generation using Flux AI
+ */
+export class FluxAIImageGen extends ImageGen {
+    /** Image style type */
+    public static styleType: ImageStyleType = ImageStyleType.AI;
+
+    /**
+     * Generate images using Flux AI
+     * @param gen - Video generator instance (unused until this provider is implemented)
+     * @param images - List of image queries
+     * @param tempPath - Temporary path to save images
+     * @param changePhotos - Change photos or not
+     * @param aiOptions - AI image generation options
+     * @param filePrefix - File prefix for images
+     * @returns List of image paths (currently never resolves: always rejects with "not implemented")
+     */
+    static async generateImages(gen: VideoGen, images: string[], tempPath: string, changePhotos: boolean, aiOptions?: AIImageGenOptions, filePrefix?: string): Promise<string[]> {
+        throw new Error("Flux AI image generation not implemented yet");
+    }
+}