import logging
from sqlalchemy import URL, create_engine, text
import requests
import json
import asyncio
import datetime
# necessary pip install comand
# pip install sqlalchemy pymysql requests asyncio

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# logger outpit to stdout
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

# Database connection details
DB_HOST = "mysql-2e01bea8-bico-f665.a.aivencloud.com"
DB_USERNAME = "avnadmin"
DB_PASSWORD = "AVNS_kT0yX_62xqyaZnUJUlW"
DB_DATABASE = "defaultdb"
DB_PORT = "28954"

# Local DB
DB_HOST = "127.0.0.1"
DB_USERNAME = "root"
DB_PASSWORD = "password"
DB_DATABASE = "defaultdb"
DB_PORT = "50003"

OPENROUTER_API_KEY = "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"
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

# Create URL object instead of string concatenation
connection_url = URL.create(
    "mysql+pymysql",
    username=DB_USERNAME,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT,
    database=DB_DATABASE,
)

# Create a SQLAlchemy engine
engine = create_engine(connection_url)

# Predefined prompt template
PROMPT_TEMPLATE = """
You are an expert chemical data analyst specializing in ingredient nomenclature and identification.

Your task is to analyze the provided list of chemical ingredients and identify groups of entries that represent the exact same chemical substance, based solely on interpreting the provided names.

The input is a JSON list of objects, each with an 'id' (a unique identifier) and a 'name' (the ingredient name).

Instructions for Grouping (Based ONLY on Name):

Identify Synonyms: Group entries whose names are common synonyms for the identical chemical substance (e.g., different naming conventions like common names, INCI names, or IUPAC names). For example, 'Cinnamal' and 'Cinnamaldehyde' typically refer to the same chemical and should be grouped. Handle Typos & Formatting: Group entries where names likely refer to the same substance but differ due to minor typos, variations in capitalization, spacing, or punctuation. For example, 'Cinnamyl Alcohol' and 'Cinnamal alcohol' might be grouped if context suggests 'Cinnamal alcohol' is a likely typo for 'Cinnamyl Alcohol'. Use chemical knowledge to infer if a typo is likely vs. representing a different substance. Ignore Annotations: Treat names as equivalent if they only differ by descriptive text in parentheses or similar non-essential annotations. For example, group 'Cinnamomum Zeylanicum Leaf Oil' and 'cinnamomum zeylanicum leaf oil (éterický olej skořice)'. Crucial - Distinguish Related but Different: Be extremely careful not to group distinct substances, even if their names are similar or derived from the same source. For example, 'Cinnamomum Zeylanicum Bark Oil' is different from 'Cinnamomum Zeylanicum Leaf Oil' and should not be grouped together, even though both come from Cinnamomum Zeylanicum. Group only if the names indicate the exact same chemical entity or defined substance/extract. Single Entries: Do not create groups for entries that have no identified duplicates or synonyms in the provided list. Canonical Entry Selection:

Within each identified group of duplicates/synonyms (a group must have more than one member), designate one ID as the 'canonical' or 'preferred' entry. Use the following criteria in order of priority:

The entry with the name that appears most chemically formal, standard, or complete (e.g., proper capitalization, potentially IUPAC name if discernible, fewer annotations). The entry with the shortest, clearest base name (often the one without parenthetical annotations). The entry with the lowest numerical ID (as a final tie-breaker). Output Format:

Return the result as a JSON object where keys are the 'canonical' IDs and values are lists of all IDs belonging to that group (including the canonical ID itself). Only include entries that are part of a group with two or more members.

Example Input Snippet: [ { "id": 408, "name": "Cinnamyl Alcohol" }, { "id": 411, "name": "Cinnamal" }, { "id": 474, "name": "Cinnamal alcohol" }, { "id": 698, "name": "Cinnamomum Zeylanicum Bark Oil" }, { "id": 16890, "name": "Cinnamomum Zeylanicum Leaf Oil" }, { "id": 17850, "name": "Cinnamaldehyde" }, { "id": 43357, "name": "cinnamomum zeylanicum leaf oil (éterický olej skořice)" } ]

Example Output Structure based on the snippet: { "411": [411, 17850], "408": [408, 474], "16890": [16890, 43357] }

Analyze the provided list and generate the JSON output containing the identified duplicate/synonym groups based only on the names.
"""


async def fetch_ingredients_batch(offset: int, limit: int):
    """Fetch a batch of ingredients from the database."""
    # Simulated database fetch
    # Replace with actual database query
    ingredients = []

    query = text("SELECT id, name FROM ingredients WHERE name LIKE 'a%' OR name LIKE 'b%' ORDER BY name asc LIMIT :limit OFFSET :offset ")
    with engine.connect() as conn:
        result = conn.execute(query, {"limit": limit, "offset": offset})
        for row in result.fetchall():
            ingredient = {
                "id": row.id,
                "name": row.name,
                # Add other fields as necessary
            }
            ingredients.append(ingredient)
    return ingredients


async def call_llm_with_ingredients(ingredients_batch):
    """Call the LLM with a batch of ingredients."""
    # Format ingredients text for prompt
    ingredients_text_JSON = json.dumps(ingredients_batch, ensure_ascii=False)
    #print(ingredients_text_JSON)

    # write prompt to new file with timestamp
    now = datetime.datetime.now()
    filename = f"prompt_{now.strftime('%Y%m%d_%H%M%S')}.txt"
    # write the prompt to file open in append mode
    append_write = open(filename, "a", encoding="utf-8")
    append_write.write(PROMPT_TEMPLATE + "\n\n" + ingredients_text_JSON)
    append_write.close()


    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": "Bearer " + OPENROUTER_API_KEY,
            "Content-Type": "application/json",
            "HTTP-Referer": "greenscan.cz",  # Optional. Site URL for rankings on openrouter.ai.
            "X-Title": "GreenScan",  # Optional. Site title for rankings on openrouter.ai.
        },
        data=json.dumps({
            "model": "google/gemini-2.0-flash-001",
            #"model": "google/gemini-2.5-pro-exp-03-25:free",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": PROMPT_TEMPLATE + "\n\n" + ingredients_text_JSON
                        }
                    ]
                }
            ],

        })
    )


    if response.status_code == 200:
        # Process the response
        data = response.json()
        if 'choices' in data and len(data['choices']) > 0:
            print(data['choices'][0]['message']['content'])
            # Parse the response content as json
            try:
                # maybe contains ```json
                # remove the ```json and ``` from the response
                content = data['choices'][0]['message']['content']
                if content.startswith("```json"):
                    content = content[7:]
                if content.endswith("```"):
                    content = content[:-3]
                # Parse the JSON content

                response_content = json.loads(content)
                # Check if the response is a valid JSON object
                return response_content
            except json.JSONDecodeError as e:
                logger.error(f"Failed to decode JSON response: {e}")
                return None
        else:
            logger.error("No valid response from LLM.")
            return None
    else:
        logger.error(f"Error calling LLM: {response.status_code} - {response.text}")


async def process_all_ingredients():
    """Process all ingredients in batches."""
    batch_size = 1000
    offset = 0
    total_processed = 0
    processed_ingredients = []

    all_ingredients = await fetch_ingredients_batch(0, 100_000)

    while True:
        # Fetch batch of ingredients
        ingredients_batch = await fetch_ingredients_batch(offset, batch_size)

        if not ingredients_batch:
            logger.info("No more ingredients to process.")
            break

        # Process the batch with LLM
        logger.info(f"Processing batch of {len(ingredients_batch)} ingredients at offset {offset}")
        response = await call_llm_with_ingredients(ingredients_batch)

        if response:
            processed_ingredients.append(response)

        total_processed += len(ingredients_batch)
        offset += batch_size


        # print
        for main_ingredient_id, ids in response.items():
            # Check if the key is a valid integer
            # find the ingredient by id (key) in all_ingredients
            main_ingredient = next((i for i in all_ingredients if str(i['id']) == str(main_ingredient_id)), None)
            if not main_ingredient:
                logger.error(f"Main ingredient with ID {main_ingredient_id} not found in all ingredients.")
                continue
            sub_ingredients = []
            for ingredient_id in ids:
                # find the ingredient by id (value) in all_ingredients
                ingredient = next((i for i in all_ingredients if str(i['id']) == str(ingredient_id)), None)
                if ingredient:
                    sub_ingredients.append(ingredient)

            # print the main ingredient and sub ingredients
            print(f"{main_ingredient['name']}")
            for ingredient in sub_ingredients:
                print(f" - {ingredient['name']}")


        await asyncio.sleep(1)

    # Save final results
    #save_processed_data()
    logger.info(f"Completed processing {total_processed} ingredients")


# Main execution function
async def main():
    logger.info("Starting ingredient processing")
    await process_all_ingredients()


# Run the async program
if __name__ == "__main__":
    asyncio.run(main())