""" Audiobook Creator Copyright (C) 2025 Prakhar Sharma This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . """ import shutil from openai import OpenAI, AsyncOpenAI from tqdm import tqdm import json import os import asyncio import re import tempfile from word2number import w2n import time import sys from pydub import AudioSegment from utils.run_shell_commands import check_if_ffmpeg_is_installed, check_if_calibre_is_installed from utils.file_utils import read_json, empty_directory from utils.audiobook_utils import merge_chapters_to_m4b, convert_audio_file_formats, add_silence_to_audio_file_by_reencoding_using_ffmpeg, merge_chapters_to_standard_audio_file, add_silence_to_audio_file_by_appending_pre_generated_silence, assemble_chapter_with_ffmpeg, add_silence_to_chapter_with_ffmpeg, get_ebook_metadata_with_cover, validate_file_path from utils.check_if_audio_generator_api_is_up import check_if_audio_generator_api_is_up from utils.voice_mapping import get_narrator_and_dialogue_voices, get_voice_for_character_score, get_narrator_voice_for_character from utils.text_preprocessing import preprocess_text_for_tts from utils.llm_utils import generate_audio_with_retry from dotenv import load_dotenv load_dotenv() TTS_BASE_URL = os.environ.get("TTS_BASE_URL", "http://localhost:8880/v1") TTS_API_KEY = os.environ.get("TTS_API_KEY", "not-needed") TTS_MODEL = os.environ.get("TTS_MODEL", "kokoro") TTS_MAX_PARALLEL_REQUESTS_BATCH_SIZE = 
int(os.environ.get("TTS_MAX_PARALLEL_REQUESTS_BATCH_SIZE", 1)) os.makedirs("audio_samples", exist_ok=True) os.makedirs("generated_audiobooks", exist_ok=True) async_openai_client = AsyncOpenAI( base_url=TTS_BASE_URL, api_key=TTS_API_KEY ) def sanitize_filename(text): # Remove or replace problematic characters text = text.replace("'", '').replace('"', '').replace('/', ' ').replace('.', ' ') text = text.replace(':', '').replace('?', '').replace('\\', '').replace('|', '') text = text.replace('*', '').replace('<', '').replace('>', '').replace('&', 'and') # cleanup file name based on pattern in run_shell_command_secure # ^[a-zA-Z0-9\-_./]+$ regex = r"[^a-zA-Z0-9\-_./\s]" text = re.sub(regex, ' ', text, 0, re.MULTILINE) # Normalize whitespace and trim text = ' '.join(text.split()) return text def is_only_punctuation(text): """ Check if a line contains only punctuation marks without any actual words. This helps avoid TTS errors when encountering lines with just punctuation. Args: text (str): The text line to check Returns: bool: True if the line contains only punctuation, False otherwise """ # Remove all whitespace cleaned_text = text.strip() # If empty after stripping, it's not useful for TTS if not cleaned_text: return True # Import string for standard punctuation import string # Extended punctuation set including common Unicode punctuation in books extended_punctuation = string.punctuation + '—–""''…‚„‹›«»‰‱' # Remove all punctuation marks (both ASCII and extended Unicode) text_without_punct = ''.join(char for char in cleaned_text if char not in extended_punctuation) # If nothing remains after removing punctuation, it's only punctuation return len(text_without_punct.strip()) == 0 def split_and_annotate_text(text): """Splits text into dialogue and narration while annotating each segment.""" parts = re.split(r'("[^"]+")', text) # Keep dialogues in the split result annotated_parts = [] for part in parts: if part: # Ignore empty strings annotated_parts.append({ "text": 
part, "type": "dialogue" if part.startswith('"') and part.endswith('"') else "narration" }) return annotated_parts def check_if_chapter_heading(text): """ Checks if a given text line represents a chapter heading. A chapter heading is considered a string that starts with either "Chapter", "Part", or "PART" (case-insensitive) followed by a number (either a digit or a word that can be converted to an integer). :param text: The text to check :return: True if the text is a chapter heading, False otherwise """ pattern = r'^(Chapter|Part|PART)\s+([\w-]+|\d+)' regex = re.compile(pattern, re.IGNORECASE) match = regex.match(text) if match: label, number = match.groups() try: # Try converting the number (either digit or word) to an integer w2n.word_to_num(number) if not number.isdigit() else int(number) return True except ValueError: return False # Invalid number format return False # No match def find_voice_for_gender_score(character: str, character_gender_map, engine_name: str, narrator_gender: str): """ Finds the appropriate voice for a character based on their gender score using the new voice mapping system. This function takes in the name of a character, a dictionary mapping character names to their gender scores, the TTS engine name, and the narrator gender preference. It returns the voice identifier that matches the character's gender score within the appropriate score map (male_score_map or female_score_map). Args: character (str): The name of the character for whom the voice is being determined. character_gender_map (dict): A dictionary mapping character names to their gender scores. engine_name (str): The TTS engine name ("kokoro" or "orpheus"). narrator_gender (str): User's narrator gender preference ("male" or "female"). Returns: str: The voice identifier that matches the character's gender score. 
""" # Handle narrator character specially if character.lower() == "narrator": return get_narrator_voice_for_character(engine_name, narrator_gender) # Get the character's gender score if "scores" in character_gender_map and character.lower() in character_gender_map["scores"]: character_info = character_gender_map["scores"][character.lower()] character_gender_score = character_info["gender_score"] return get_voice_for_character_score(engine_name, narrator_gender, character_gender_score) else: # Fallback for unknown characters - use score 5 (neutral) return get_voice_for_character_score(engine_name, narrator_gender, 5) def validate_book_for_m4b_generation(book_path): """ Validates that the book file is suitable for M4B audiobook generation. This function performs early validation to catch issues before audio generation: - Checks if the book file path is safe and accessible - Verifies that ebook-meta command is available - Tests metadata extraction from the book - Ensures cover image can be extracted Args: book_path (str): Path to the book file Returns: tuple: (is_valid, error_message, metadata) - is_valid (bool): True if validation passed - error_message (str): Error description if validation failed, None if passed - metadata (dict): Extracted metadata if successful, None if failed """ try: # Validate file path safety and existence if not validate_file_path(book_path): return False, f"Invalid or inaccessible book file: {book_path}. Please check the file path and permissions.", None # Test metadata extraction (this also validates ebook-meta availability) metadata = get_ebook_metadata_with_cover(book_path) # Check if we got meaningful metadata if not metadata or len(metadata) == 0: return False, f"No metadata could be extracted from the book file: {book_path}. 
Please ensure it's a valid ebook format.", None # Check if cover extraction worked (cover.jpg should exist after get_ebook_metadata_with_cover) if not validate_file_path("cover.jpg"): return False, f"Could not extract cover image from the book file: {book_path}. The book may not contain a cover image.", None return True, None, metadata except ValueError as e: return False, f"Book file validation error: {str(e)}", None except RuntimeError as e: return False, f"Ebook processing error: {str(e)}. Please ensure Calibre is properly installed and the book file is not corrupted.", None except Exception as e: return False, f"Unexpected error during book validation: {str(e)}", None async def generate_audio_with_single_voice(output_format, narrator_gender, generate_m4b_audiobook_file=False, book_path="", add_emotion_tags=False): # Read the text from the file """ Generate an audiobook using a single voice for narration and dialogues. This asynchronous function reads text from a file, processes each line to determine if it is narration or dialogue, and generates corresponding audio using specified voices. The generated audio is organized by chapters, with options to create an M4B audiobook file or a standard audio file in the specified output format. Args: output_format (str): The desired output format for the final audiobook (e.g., "mp3", "wav"). narrator_gender (str): The gender of the narrator ("male" or "female") to select appropriate voices. generate_m4b_audiobook_file (bool, optional): Flag to determine whether to generate an M4B file. Defaults to False. book_path (str, optional): The file path for the book to be used in M4B creation. Defaults to an empty string. add_emotion_tags (bool, optional): Whether to use pre-applied emotion tags in the audiobook. Defaults to False. Yields: str: Progress updates as the audiobook generation progresses through loading text, generating audio, organizing by chapters, assembling chapters, and post-processing steps. 
""" # Early validation for M4B generation if generate_m4b_audiobook_file: yield "Validating book file for M4B audiobook generation..." is_valid, error_message, metadata = validate_book_for_m4b_generation(book_path) if not is_valid: raise ValueError(f"❌ Book validation failed: {error_message}") yield f"✅ Book validation successful! Title: {metadata.get('Title', 'Unknown')}, Author: {metadata.get('Author(s)', 'Unknown')}" # Check if emotion tags should be used and if they have been pre-applied if add_emotion_tags and os.path.exists("tag_added_lines_chunks.txt"): with open("tag_added_lines_chunks.txt", "r", encoding='utf-8') as f: text = f.read() yield "Using pre-processed text with emotion tags" else: with open("converted_book.txt", "r", encoding='utf-8') as f: text = f.read() # Apply text preprocessing for Orpheus TTS to prevent repetition issues if TTS_MODEL.lower() == "orpheus": text = preprocess_text_for_tts(text) yield "Applied text preprocessing for Orpheus TTS" lines = text.split("\n") # Filter out empty lines lines = [line.strip() for line in lines if line.strip()] # Set the voices to be used - now using the new voice mapping system narrator_voice, dialogue_voice = get_narrator_and_dialogue_voices( engine_name=TTS_MODEL, narrator_gender=narrator_gender ) # Setup directories temp_audio_dir = "temp_audio" temp_line_audio_dir = os.path.join(temp_audio_dir, "line_segments") empty_directory(temp_audio_dir) os.makedirs(temp_audio_dir, exist_ok=True) os.makedirs(temp_line_audio_dir, exist_ok=True) # Batch processing parameters semaphore = asyncio.Semaphore(TTS_MAX_PARALLEL_REQUESTS_BATCH_SIZE) # Initial setup for chapters chapter_index = 1 current_chapter_audio = f"Introduction.wav" chapter_files = [] # First pass: Generate audio for each line independently total_size = len(lines) progress_counter = 0 # For tracking progress with tqdm in an async context progress_bar = tqdm(total=total_size, unit="line", desc="Audio Generation Progress") # Maps chapters to their 
line indices chapter_line_map = {} async def process_single_line(line_index, line): async with semaphore: nonlocal progress_counter if not line or is_only_punctuation(line): progress_bar.update(1) progress_counter += 1 return None # Split the line into annotated parts annotated_parts = split_and_annotate_text(line) # Create combined audio using PyDub for seamless concatenation combined_audio = AudioSegment.empty() for part in annotated_parts: text_to_speak = part["text"].strip() if not text_to_speak or is_only_punctuation(text_to_speak): continue voice_to_speak_in = narrator_voice if part["type"] == "narration" else dialogue_voice # strip all double quotes from the text to speak text_to_speak = text_to_speak.replace('"', '').replace('\\', '') # Create temporary file for this part temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False) temp_path = temp_file.name temp_file.close() try: # Generate audio for the part using retry mechanism audio_buffer = await generate_audio_with_retry( async_openai_client, TTS_MODEL, text_to_speak, voice_to_speak_in ) # Write part audio to temp file with open(temp_path, "wb") as temp_wav: temp_wav.write(audio_buffer) # Load as AudioSegment and add to combined audio part_segment = AudioSegment.from_wav(temp_path) combined_audio += part_segment except Exception as e: # Log the error for debugging print(f"Warning: Failed to generate audio for text: '{text_to_speak[:50]}...' 
- Error: {str(e)}") # Skip this part and continue with next part finally: # Always clean up temp file if os.path.exists(temp_path): os.unlink(temp_path) # Check if we have any audio content before exporting if len(combined_audio) == 0: # If no audio was generated for this line, skip it entirely progress_bar.update(1) progress_counter += 1 return None # Write this line's audio to a temporary file line_audio_path = os.path.join(temp_line_audio_dir, f"line_{line_index:06d}.wav") combined_audio.export(line_audio_path, format="wav") # Update progress bar progress_bar.update(1) progress_counter += 1 return { "index": line_index, "is_chapter_heading": check_if_chapter_heading(line), "line": line, } # Create tasks and store them with their index for result collection tasks = [] task_to_index = {} for i, line in enumerate(lines): task = asyncio.create_task(process_single_line(i, line)) tasks.append(task) task_to_index[task] = i # Initialize results_all list results_all = [None] * len(lines) # Process tasks with progress updates last_reported = -1 while tasks: done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) # Store results as tasks complete for completed_task in done: idx = task_to_index[completed_task] results_all[idx] = completed_task.result() tasks = list(pending) # Only yield if the counter has changed if progress_counter > last_reported: last_reported = progress_counter percent = (progress_counter / total_size) * 100 yield f"Generating audiobook. 
Progress: {percent:.1f}%" # All tasks have completed at this point and results_all is populated results = [r for r in results_all if r is not None] # Filter out empty lines progress_bar.close() # Filter out empty lines (same as in your original code) results = [r for r in results_all if r is not None] yield "Completed generating audio for all lines" # Second pass: Organize by chapters chapter_organization_bar = tqdm(total=len(results), unit="result", desc="Organizing Chapters") for result in sorted(results, key=lambda x: x["index"]): # Check if this is a chapter heading if result["is_chapter_heading"]: chapter_index += 1 current_chapter_audio = f"{sanitize_filename(result['line'])}.wav" if current_chapter_audio not in chapter_files: chapter_files.append(current_chapter_audio) chapter_line_map[current_chapter_audio] = [] # Add this line index to the chapter chapter_line_map[current_chapter_audio].append(result["index"]) chapter_organization_bar.update(1) chapter_organization_bar.close() yield "Organizing audio by chapters complete" # Third pass: Concatenate audio files for each chapter in order chapter_assembly_bar = tqdm(total=len(chapter_files), unit="chapter", desc="Assembling Chapters") for chapter_file in chapter_files: # Use FFmpeg-based assembly instead of PyDub for memory efficiency assemble_chapter_with_ffmpeg( chapter_file, chapter_line_map[chapter_file], temp_line_audio_dir, temp_audio_dir ) chapter_assembly_bar.update(1) yield f"Assembled chapter: {chapter_file}" chapter_assembly_bar.close() yield "Completed assembling all chapters" # Post-processing steps post_processing_bar = tqdm(total=len(chapter_files)*2, unit="task", desc="Post Processing") # Add silence to each chapter file using FFmpeg for chapter_file in chapter_files: chapter_path = os.path.join(temp_audio_dir, chapter_file) # Use FFmpeg-based silence addition instead of PyDub for memory efficiency add_silence_to_chapter_with_ffmpeg(chapter_path, 1000) # 1 second silence 
post_processing_bar.update(1) yield f"Added silence to chapter: {chapter_file}" m4a_chapter_files = [] # Convert all chapter files to M4A format for chapter_file in chapter_files: chapter_name = chapter_file.split('.')[0] m4a_chapter_files.append(f"{chapter_name}.m4a") # Convert WAV to M4A for better compatibility with timestamps and metadata convert_audio_file_formats("wav", "m4a", temp_audio_dir, chapter_name) post_processing_bar.update(1) yield f"Converted chapter to M4A: {chapter_name}" post_processing_bar.close() # Clean up temp line audio files shutil.rmtree(temp_line_audio_dir) yield "Cleaned up temporary files" if generate_m4b_audiobook_file: # Merge all chapter files into a final m4b audiobook yield "Creating M4B audiobook file..." merge_chapters_to_m4b(book_path, m4a_chapter_files) yield "M4B audiobook created successfully" else: # Merge all chapter files into a standard M4A audiobook yield "Creating final audiobook..." merge_chapters_to_standard_audio_file(m4a_chapter_files) convert_audio_file_formats("m4a", output_format, "generated_audiobooks", "audiobook") yield f"Audiobook in {output_format} format created successfully" def apply_emotion_tags_to_multi_voice_data(json_data_array): """ Dynamically apply pre-processed emotion tags to multi-voice JSONL data. This function reads emotion-enhanced text from tag_added_lines_chunks.txt and applies it to the speaker-attributed JSONL data in memory, preserving speaker attributions while using the enhanced text content. 
Args: json_data_array (list): Original speaker-attributed JSONL data Returns: tuple: (success, json_data_array, message) - success (bool): True if emotion tags were successfully applied - json_data_array (list): Updated JSONL data with emotion tags - message (str): Status message describing the result """ if not os.path.exists("tag_added_lines_chunks.txt"): return False, json_data_array, "No pre-processed emotion tags found" try: # Read the enhanced lines from tag_added_lines_chunks.txt with open("tag_added_lines_chunks.txt", "r", encoding='utf-8') as f: enhanced_lines = f.read().split('\n') # Dynamically create enhanced JSONL data by matching enhanced lines with original speaker attributions if len(enhanced_lines) == len(json_data_array): for i, item in enumerate(json_data_array): item["line"] = enhanced_lines[i] return True, json_data_array, "Successfully applied pre-processed emotion tags" else: return False, json_data_array, f"Line count mismatch: {len(enhanced_lines)} enhanced lines vs {len(json_data_array)} speaker-attributed lines" except Exception as e: return False, json_data_array, f"Error applying emotion tags: {str(e)}" async def generate_audio_with_multiple_voices(output_format, narrator_gender, generate_m4b_audiobook_file=False, book_path="", add_emotion_tags=False): # Path to the JSONL file containing speaker-attributed lines """ Generate an audiobook in the specified format using multiple voices for each line Uses the provided JSONL file to map speaker names to voices. The JSONL file should contain entries with the following format: { "line": , "speaker": } The function will generate audio for each line independently and then concatenate the audio files for each chapter in order. The final audiobook will be saved in the "generated_audiobooks" directory with the name "audiobook.". :param output_format: The desired format of the final audiobook (e.g. "m4a", "mp3") :param narrator_gender: The gender of the narrator voice (e.g. 
"male", "female") :param generate_m4b_audiobook_file: Whether to generate an M4B audiobook file instead of a standard M4A file :param book_path: The path to the book file (required for generating an M4B audiobook file) :param add_emotion_tags: Whether to use pre-applied emotion tags in the audiobook. Defaults to False. """ # Early validation for M4B generation if generate_m4b_audiobook_file: yield "Validating book file for M4B audiobook generation..." is_valid, error_message, metadata = validate_book_for_m4b_generation(book_path) if not is_valid: raise ValueError(f"❌ Book validation failed: {error_message}") yield f"✅ Book validation successful! Title: {metadata.get('Title', 'Unknown')}, Author: {metadata.get('Author(s)', 'Unknown')}" file_path = 'speaker_attributed_book.jsonl' json_data_array = [] # Open the JSONL file and read it line by line with open(file_path, 'r', encoding='utf-8') as file: for line in file: # Parse each line as a JSON object json_object = json.loads(line.strip()) # Append the parsed JSON object to the array json_data_array.append(json_object) yield "Loaded speaker-attributed lines from JSONL file" # Apply emotion tags if requested and available if add_emotion_tags: success, json_data_array, message = apply_emotion_tags_to_multi_voice_data(json_data_array) if success: yield f"✅ {message}" else: yield f"⚠️ {message}" yield "Falling back to original text without emotion tags" else: # Check if emotion tags exist in the original JSONL data and remove them if user doesn't want them has_emotion_tags = any( '' in item.get('line', '') or '' in item.get('line', '') or '' in item.get('line', '') or '' in item.get('line', '') or '' in item.get('line', '') or '' in item.get('line', '') or '' in item.get('line', '') or '' in item.get('line', '') for item in json_data_array ) if has_emotion_tags: yield "Removing existing emotion tags from JSONL data as per user preference" import re for item in json_data_array: if "line" in item and item["line"]: # Remove 
emotion tags from the line line_without_tags = re.sub(r'<(?:laugh|chuckle|sigh|cough|sniffle|groan|yawn|gasp)>\s*', '', item["line"]) item["line"] = line_without_tags # Apply text preprocessing for Orpheus TTS to prevent repetition issues if TTS_MODEL.lower() == "orpheus": for item in json_data_array: if "line" in item and item["line"]: item["line"] = preprocess_text_for_tts(item["line"]) yield "Applied text preprocessing for Orpheus TTS" # Load mappings for character gender character_gender_map = read_json("character_gender_map.json") # Get narrator voice using the new voice mapping system narrator_voice = find_voice_for_gender_score("narrator", character_gender_map, TTS_MODEL, narrator_gender) yield "Loaded voice mappings and selected narrator voice" # Setup directories temp_audio_dir = "temp_audio" temp_line_audio_dir = os.path.join(temp_audio_dir, "line_segments") empty_directory(temp_audio_dir) os.makedirs(temp_audio_dir, exist_ok=True) os.makedirs(temp_line_audio_dir, exist_ok=True) yield "Set up temporary directories for audio processing" # Batch processing parameters semaphore = asyncio.Semaphore(TTS_MAX_PARALLEL_REQUESTS_BATCH_SIZE) # Initial setup for chapters chapter_index = 1 current_chapter_audio = f"Introduction.wav" chapter_files = [] # First pass: Generate audio for each line independently # and track chapter organization chapter_line_map = {} # Maps chapters to their line indices progress_counter = 0 # For tracking progress with tqdm in an async context total_lines = len(json_data_array) progress_bar = tqdm(total=total_lines, unit="line", desc="Audio Generation Progress") yield "Generating audio..." 
async def process_single_line(line_index, doc): async with semaphore: nonlocal progress_counter line = doc["line"].strip() if not line or is_only_punctuation(line): progress_bar.update(1) progress_counter += 1 return None speaker = doc["speaker"] speaker_voice = find_voice_for_gender_score(speaker, character_gender_map, TTS_MODEL, narrator_gender) # Split the line into annotated parts annotated_parts = split_and_annotate_text(line) # Create combined audio using PyDub for seamless concatenation combined_audio = AudioSegment.empty() for part in annotated_parts: text_to_speak = part["text"].strip() if not text_to_speak or is_only_punctuation(text_to_speak): continue voice_to_speak_in = narrator_voice if part["type"] == "narration" else speaker_voice # strip all double quotes and backslashes from the text to speak text_to_speak = text_to_speak.replace('"', '').replace('\\', '') # Create temporary file for this part temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False) temp_path = temp_file.name temp_file.close() try: # Generate audio for the part using retry mechanism audio_buffer = await generate_audio_with_retry( async_openai_client, TTS_MODEL, text_to_speak, voice_to_speak_in ) # Write part audio to temp file with open(temp_path, "wb") as temp_wav: temp_wav.write(audio_buffer) # Load as AudioSegment and add to combined audio part_segment = AudioSegment.from_wav(temp_path) combined_audio += part_segment except Exception as e: # Log the error for debugging print(f"Warning: Failed to generate audio for text: '{text_to_speak[:50]}...' 
- Error: {str(e)}") # Skip this part and continue with next part finally: # Always clean up temp file if os.path.exists(temp_path): os.unlink(temp_path) # Check if we have any audio content before exporting if len(combined_audio) == 0: # If no audio was generated for this line, skip it entirely progress_bar.update(1) progress_counter += 1 return None # Write this line's audio to a temporary file line_audio_path = os.path.join(temp_line_audio_dir, f"line_{line_index:06d}.wav") combined_audio.export(line_audio_path, format="wav") # Update progress bar progress_bar.update(1) progress_counter += 1 return { "index": line_index, "is_chapter_heading": check_if_chapter_heading(line), "line": line } # Create tasks and store them with their index for result collection tasks = [] task_to_index = {} for i, doc in enumerate(json_data_array): task = asyncio.create_task(process_single_line(i, doc)) tasks.append(task) task_to_index[task] = i # Initialize results_all list results_all = [None] * len(json_data_array) # Process tasks with progress updates last_reported = -1 while tasks: done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) # Store results as tasks complete for completed_task in done: idx = task_to_index[completed_task] results_all[idx] = completed_task.result() tasks = list(pending) # Only yield if the counter has changed if progress_counter > last_reported: last_reported = progress_counter percent = (progress_counter / total_lines) * 100 yield f"Generating audiobook. 
Progress: {percent:.1f}%" # All tasks have completed at this point and results_all is populated results = [r for r in results_all if r is not None] # Filter out empty lines progress_bar.close() # Filter out empty lines (same as in your original code) results = [r for r in results_all if r is not None] yield "Completed generating audio for all lines" # Second pass: Organize by chapters chapter_organization_bar = tqdm(total=len(results), unit="result", desc="Organizing Chapters") yield "Organizing lines into chapters" for result in sorted(results, key=lambda x: x["index"]): # Check if this is a chapter heading if result["is_chapter_heading"]: chapter_index += 1 current_chapter_audio = f"{sanitize_filename(result['line'])}.wav" if current_chapter_audio not in chapter_files: chapter_files.append(current_chapter_audio) chapter_line_map[current_chapter_audio] = [] # Add this line index to the chapter chapter_line_map[current_chapter_audio].append(result["index"]) chapter_organization_bar.update(1) chapter_organization_bar.close() yield f"Organized {len(results)} lines into {len(chapter_files)} chapters" # Third pass: Concatenate audio files for each chapter in order chapter_assembly_bar = tqdm(total=len(chapter_files), unit="chapter", desc="Assembling Chapters") for chapter_file in chapter_files: # Use FFmpeg-based assembly instead of PyDub for memory efficiency assemble_chapter_with_ffmpeg( chapter_file, chapter_line_map[chapter_file], temp_line_audio_dir, temp_audio_dir ) chapter_assembly_bar.update(1) yield f"Assembled chapter: {chapter_file}" chapter_assembly_bar.close() yield "Completed assembling all chapters" # Post-processing steps post_processing_bar = tqdm(total=len(chapter_files)*2, unit="task", desc="Post Processing") # Add silence to each chapter file using FFmpeg for chapter_file in chapter_files: chapter_path = os.path.join(temp_audio_dir, chapter_file) # Use FFmpeg-based silence addition instead of PyDub for memory efficiency 
add_silence_to_chapter_with_ffmpeg(chapter_path, 1000) # 1 second silence post_processing_bar.update(1) yield f"Added silence to chapter: {chapter_file}" m4a_chapter_files = [] # Convert all chapter files to M4A format for chapter_file in chapter_files: chapter_name = chapter_file.split('.')[0] m4a_chapter_files.append(f"{chapter_name}.m4a") # Convert WAV to M4A for better compatibility with timestamps and metadata convert_audio_file_formats("wav", "m4a", temp_audio_dir, chapter_name) post_processing_bar.update(1) yield f"Converted chapter to M4A: {chapter_name}" post_processing_bar.close() # Clean up temp line audio files yield "Cleaning up temporary files" shutil.rmtree(temp_line_audio_dir) yield "Temporary files cleanup complete" if generate_m4b_audiobook_file: # Merge all chapter files into a final m4b audiobook yield "Creating M4B audiobook file..." merge_chapters_to_m4b(book_path, m4a_chapter_files) yield "M4B audiobook created successfully" else: # Merge all chapter files into a standard M4A audiobook yield "Creating final audiobook..." merge_chapters_to_standard_audio_file(m4a_chapter_files) convert_audio_file_formats("m4a", output_format, "generated_audiobooks", "audiobook") yield f"Audiobook in {output_format} format created successfully" async def process_audiobook_generation(voice_option, narrator_gender, output_format, book_path, add_emotion_tags=False): is_audio_generator_api_up, message = await check_if_audio_generator_api_is_up(async_openai_client) if not is_audio_generator_api_up: raise Exception(message) generate_m4b_audiobook_file = False if output_format == "M4B (Chapters & Cover)": generate_m4b_audiobook_file = True try: if voice_option == "Single Voice": yield "\n🎧 Generating audiobook with a **single voice**..." 
async def main():
    """
    Interactive command-line entry point for audiobook generation.

    Prompts for the voice mode, the audiobook type (M4B or a standard audio
    file), the narrator gender and, when TTS_MODEL is "orpheus", whether to
    use emotion tags. For M4B output it checks that Calibre and ffmpeg are
    installed and validates the book file (taken from the first command-line
    argument when given, otherwise prompted for) before generation starts.

    Every invalid answer prints a message and returns without generating
    anything. Progress lines from the selected generator are printed as they
    arrive, followed by the output location and the elapsed time.
    """
    os.makedirs("generated_audiobooks", exist_ok=True)

    # Default values
    book_path = "./sample_book_and_audio/The Adventure of the Lost Treasure - Prakhar Sharma.epub"
    generate_m4b_audiobook_file = False
    output_format = "aac"

    # Single source of truth for the formats shown in prompts and validated below.
    supported_formats = ["aac", "m4a", "mp3", "wav", "opus", "flac", "pcm"]

    # Maps the menu choice to its announcement and generator.
    voice_generators = {
        "1": ("\n🎧 Generating audiobook with a **single voice**...", generate_audio_with_single_voice),
        "2": ("\n🎭 Generating audiobook with **multiple voices**...", generate_audio_with_multiple_voices),
    }

    # Prompt user for voice selection
    print("\n🎙️ **Audiobook Voice Selection**")
    voice_option = input("🔹 Enter **1** for **Single Voice** or **2** for **Multiple Voices**: ").strip()

    # Reject a bad choice right away instead of after every other prompt
    # (and after the Calibre/ffmpeg/book validation) has already run.
    if voice_option not in voice_generators:
        print("\n⚠️ Invalid option! Please restart and enter either **1** or **2**.")
        return
    announcement, generate_audio = voice_generators[voice_option]

    # Prompt user for audiobook type selection
    print("\n🎙️ **Audiobook Type Selection**")
    print("🔹 Do you want the audiobook in M4B format (the standard format for audiobooks) with chapter timestamps and embedded book cover ? (Needs calibre and ffmpeg installed)")
    print(f"🔹 OR do you want a standard audio file in either of {supported_formats} formats without any of the above features ?")
    audiobook_type_option = input("🔹 Enter **1** for **M4B audiobook format** or **2** for **Standard Audio File**: ").strip()

    # Any answer other than "1" is treated as the standard audio file path.
    if audiobook_type_option == "1":
        if not check_if_calibre_is_installed():
            print("⚠️ Calibre is not installed. Please install it first and make sure **calibre** and **ebook-meta** commands are available in your PATH.")
            return

        if not check_if_ffmpeg_is_installed():
            print("⚠️ FFMpeg is not installed. Please install it first and make sure **ffmpeg** and **ffprobe** commands are available in your PATH.")
            return

        # Check if a path is provided via command-line arguments
        if len(sys.argv) > 1:
            book_path = sys.argv[1]
            print(f"📂 Using book file from command-line argument: **{book_path}**")
        else:
            # Ask user for book file path if not provided
            input_path = input("\n📖 Enter the **path to the book file**, needed for metadata and cover extraction. (Press Enter to use default): ").strip()
            if input_path:
                book_path = input_path
            print(f"📂 Using book file: **{book_path}**")

        print("✅ Book path set. Proceeding...\n")

        # Early validation of the book file for M4B generation
        print("🔍 Validating book file for M4B audiobook generation...")
        is_valid, error_message, metadata = validate_book_for_m4b_generation(book_path)

        if not is_valid:
            print(f"❌ **Book validation failed**: {error_message}")
            print("\n💡 **Troubleshooting Tips:**")
            print(" • Ensure the book file path is correct and the file exists")
            print(" • Verify the book file is a supported ebook format (EPUB, MOBI, PDF, etc.)")
            print(" • Check that Calibre is properly installed and ebook-meta command is available")
            print(" • Make sure the book file is not corrupted")
            print(" • Ensure the book file contains extractable metadata and cover image")
            return

        print("✅ **Book validation successful!**")
        print(f" • Title: {metadata.get('Title', 'Unknown')}")
        print(f" • Author: {metadata.get('Author(s)', 'Unknown')}")
        print(" • Cover image: Successfully extracted")
        print()

        generate_m4b_audiobook_file = True
    else:
        # Prompt user for audio format selection
        print("\n🎙️ **Audiobook Output Format Selection**")
        # Lower-cased so that answers such as "MP3" are accepted.
        output_format = input(f"🔹 Choose between {supported_formats}. ").strip().lower()

        if output_format not in supported_formats:
            print("\n⚠️ Invalid output format! Please choose from the give options")
            return

    # Prompt user for narrator's gender selection
    print("\n🎙️ **Audiobook Narrator Voice Selection**")
    # Lower-cased so that answers such as "Male" are accepted.
    narrator_gender = input("🔹 Enter **male** if you want the book to be read in a male voice or **female** if you want the book to be read in a female voice: ").strip().lower()

    if narrator_gender not in ["male", "female"]:
        print("\n⚠️ Invalid narrator gender! Please choose from the give options")
        return

    # Prompt user for emotion tags option if using Orpheus TTS
    add_emotion_tags = False
    if TTS_MODEL.lower() == "orpheus":
        print("\n🎭 **Emotion Tags Enhancement (Orpheus TTS)**")
        print("🔹 Emotion tags add natural expressions like laughter, sighs, gasps to your audiobook")
        print("🔹 Available tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>")
        emotion_tags_option = input("🔹 Do you want to use emotion tags in the audiobook? Enter **yes** or **no**: ").strip().lower()

        if emotion_tags_option in ["yes", "y", "true", "1"]:
            add_emotion_tags = True
            print("✅ Emotion tags will be used in the audiobook!")
        else:
            print("ℹ️ Emotion tags disabled. Standard narration will be used.")
    else:
        print(f"\nℹ️ **Note**: Emotion tags are only available with Orpheus TTS. Current engine: {TTS_MODEL}")

    start_time = time.time()

    print(announcement)
    async for line in generate_audio(output_format, narrator_gender, generate_m4b_audiobook_file, book_path, add_emotion_tags):
        print(line)

    print(f"\n🎧 Audiobook is generated ! The audiobook is saved as **audiobook.{'m4b' if generate_m4b_audiobook_file else output_format}** in the **generated_audiobooks** directory in the current folder.")

    execution_time = time.time() - start_time
    print(f"\n⏱️ **Execution Time:** {execution_time:.6f} seconds\n✅ Audiobook generation complete!")


if __name__ == "__main__":
    asyncio.run(main())