import {
  AgentType,
  BaseFlowTool,
  FlowFile,
  FlowStep,
  FlowTool,
} from '../fileflow.interface'
import { defaultTransformerParameters, executeAzureTool, NON_PRINTABLE_REGEX } from '../utils'
import { DocuPandaTool } from './DocuPandaTool'
import { DocuPandaAsImageTool } from './DocuPandaAsImageTool'
import { DocuPandaRemoveWatermarkTool } from './DocuPandaRemoveWatermarkTool'
import { FileflowServiceInterface } from '../fileflow.interface'
import { CHATGPT_COSTS, JsonFormData } from '@cheaseed/node-utils'
import { generateDocupandaPageOutputPreview } from '../docupanda-utils';
import { UNDERSCORE } from '../fidoc-constants'

const INSTRUCTIONS = `
# User Prompt #

## Input ##
- Your input is a JSON list of arrays of strings, representing a table of financial information.
Example JSON input:
	{
  "tableList": [
    ["", "Quarter", "Year to Date", "Inception to Date"],
    ["Beginning net asset value", "$ 269,917", "$ 256,695", "$ -"],
    ["Contributions", "1,186", "4,706", "269,519"]
		]
	}
---

## Definitions ##
- The first array of the tableList is the "column header" of the financial table.
- The first element of each array following the column header is called the "row header".
---

## Rules ##
- **Do NOT remove any elements in the array, modifying the contents is fine, but if an input array has 7 elements, then the output array also needs to contain 7 elements.**
- Empty array elements are expected and are fine (example: "").
- If you encounter any elements or scenarios not covered by the rules, leave the elements unchanged.
---

## Instructions ##
- Please follow these instructions precisely.
- Apply the rules in the order provided.
- Do not apply any transformations, corrections, or formatting beyond what is explicitly specified in the instructions. 
- **Do not make assumptions or inferences about the data that are not covered by the instructions.**
---

- **Summary of Steps:**
  - **Step 1:** Detect and convert non-English text in column headers and row headers to US English; convert non-English numerical formats to standard accounting formats.  
  - **Step 2:** Correct the placement of '$' symbols according to the specified rules without altering elements that don't meet any conditions.
  - **Step 3:** Properly indent 'row headers' that are subsets of the previous 'row header'.
  - **Final Review:** Verify that all changes are correctly applied and the output maintains the same structure and integrity as the input.
---

## Step 1 - Detect and Convert Non-English Text and Numerical Formats.
- Process each element in the order of the rules provided. If an element does not meet the conditions of any rule, it must remain unchanged. Do not infer or apply transformations beyond these rules.
- **Do not remove or shift elements in the array. All transformations should occur within the existing structure of the array.**

- **Rule 0: Preserve Empty Strings**
    - **If an element is an empty string (""), leave it unchanged. Do not remove it or attempt to translate or convert it.**

- **Rule 1: Enhanced Language Detection and Translation**
    - **Examine** the text in each array element to determine if it is written in a language other than US English.
    - **Indicators of Non-English Text** :
        - **Presence of Accented Characters** (e.g., é, ñ, ö, è, à, ç).
        - **Detection of Common Non-English Words** :
            - **French Financial Terms** :
	            - "Actif", "Passif", "Immobilisations", "Frais", "Préliminaires", "Charges", "Produits", "Stocks", "Créances", "Trésorerie", "Total", "Général", "Période", "Courante", "Précédente", "Amortissements", "Provision", "Valeur", "Net", "Brut", "Exercices".
            - **Common French Words and Articles** :
                - Words like "Montant", "Revenus", "Compte", "Numéro", "Valeur", "Début", "Fin".
                - Articles like "le", "la", "les", "de", "des", "à", "en", "et", "sur", "plusieurs".
            - **Spanish** :
                - Words like "Número", "Cuenta", "Monto", "Ingresos", "Valor", "Inicio", "Fin".
            - **German** :
                - Words like "Betrag", "Summe", "Einnahmen", "Konto", "Anfang", "Ende".
        - **Unusual Grammatical Structures** :
            - Phrases that are not coherent in English or use syntax typical of another language.
    - **Translation Instructions** :
        - **If any indicators are found** :
            - **Translate** the text to US English using accurate and context-appropriate translations.
            - **Ensure** that financial terminology is correctly interpreted during translation.
            - **Use** a reliable English financial glossary for accurate terms.
        - **Do Not Modify** text that is already in US English, even if it contains accented characters due to borrowed words.
    - **Avoid False Positives** :
        - Be careful not to alter English words that may resemble non-English words.
        - **If uncertain**, it is safer to leave the text unchanged.
        - **Examples** :
            - Do not change "Resume" (to continue) to "Résumé" (a summary).
    - **Examples** :
	    - Input: ["", "ACTIF", "Du PERIODE COURANTE", "", "", "PERIODE PRECEDENTE au:31/12/20"]
			- Output: ["", "ASSETS", "OF CURRENT PERIOD", "", "", "PREVIOUS PERIOD as of:12/31/20"]
		- Input: ["", ". Capital social ou personnel", "21 000 000,00", "21 000 000,00"]
			- Output: ["", "  . Share capital", "21,000,000.00", "21,000,000.00"]
		- Input: ["1", "PRODUITS D'EXPLOITATION . Ventes de marchandises (en l'etat)", "", "", "", ""]
			- Output: ["1", "OPERATING INCOME . Sales of goods (as is)", "", "", "", ""]
        - Input: ["Revenus", "Montant total"]
            - Output: ["Revenue", "Total Amount"]
        - Input: ["Número de cuenta", "Valor inicial"]
            - Output: ["Account Number", "Beginning Value"]
        - Input: ["Anfangssaldo", "Endbestand"]
            - Output: ["Beginning Balance", "Ending Balance"]

- **Rule 2: Numerical Format Conversion.**
    - **Identify** numerical values that use non-US formatting, such as:
        - **French Format**: Spaces as thousand separators and commas as decimal points (e.g., "12 345 678,99").
        - **German/European Format**: Periods as thousand separators and commas as decimal points (e.g., "1.234.567,89").
    - **Convert** identified numerical values to standard US accounting format:
        - Use commas as thousand separators and periods as decimal points (e.g., "12,345,678.99").
	- Examples:
        - "12 345 678,99" -> "12,345,678.99"
        - "-1.234.567,89" -> "-1,234,567.89"
        - "(98.765,43)" -> "(98,765.43)"

- ### Feedback Loop: ###
    - After completing Step 1, thoroughly review the 'output' to ensure:
	    - **Ensure that the array structure has the same number of input and output elements for every array.** 
        - All non-English text has been accurately translated to US English.
        - All numerical values in non-US formats have been correctly converted to standard US accounting format.
        - No English text or standard US numerical formats have been altered.
        - Translations are contextually appropriate and maintain the financial meaning.
        - Consistency in Financial Terminology: Ensure that the same terms are used throughout the table for consistency.
---

## Step 2: Correct the placement of currency symbols ##
- Process each element in the order of the rules provided. If an element does not meet the conditions of any rule, it must remain unchanged. Do not infer or apply transformations beyond these rules.
- **Do not remove or shift elements in the array. All transformations should occur within the existing structure of the array.**

- Rule 1: If an element contains a '$' and is followed by a number (including negative numbers or numbers in parentheses), but separated by one or more spaces, **remove all spaces** between the '$' and the number.
	- Look for other formatting issues and correct them using the examples below.
  - Examples:
    - "$ 1234" -> "$1234"
    - "$ (1234)" -> "$(1234)"
    - "$ -1234" -> "$-1234"
    - "- $ -" -> "$-"
    - "- -" -> "-"
    - "--" -> "-"
    - "$ - -" -> "$-"

- Rule 2: If an element has a number (including negative numbers or numbers in parentheses) followed by a trailing '$', **remove the trailing** '$'.
  - Examples:
    - "1234$" -> "1234"
    - "(1234)$" -> "(1234)"
    - "-1234$" -> "-1234"

- Rule 3: If an element contains **only** a '$' (no other characters or spaces), and the next element in the array is numerical (including negative numbers or numbers in parentheses), **change only the first element** to an empty string "" and **prepend** '$' to the numerical value in the next element. **Do not make any other changes to any elements.**
  - Examples:
    - ["$", "1234"] -> ["", "$1234"]
    - ["$", "(1234)"] -> ["", "$(1234)"]
    - ["$", "-1234"] -> ["", "$-1234"]

- Rule 4: If an element is empty (""), and the next element is numerical (including negative numbers or numbers in parentheses), *do not* add a '$' to the next element.
  - Examples:
    - ["", "1234"] -> ["", "1234"]
    - ["", "(1234)"] -> ["", "(1234)"]
    - ["", "-1234"] -> ["", "-1234"]

- Rule 5: **Do not add a '$' to any element unless it already contains a '$', or as specified in Rule 3.** If an element is a numerical value without a '$', it should remain unchanged unless modified according to Rule 1, Rule 2, or Rule 3.
  - Examples:
    - "1234" -> "1234" 
    - "(5678)" -> "(5678)"
    - "-1234" -> "-1234"

- Rule 6: If an element contains text but starts with "." remove one or more "." and the extra spaces before the text.
	- Examples:
		- ". Charges to be spread over several periods" -> "Charges to be spread over several periods"
		- ".. . Resold purchases of goods" -> "Resold purchases of goods"

- Rule 7: If an element starts with "* " followed by text, remove the starting "*" and the extra spaces before the text.
	- Do not remove "*" contained in the middle of the text string.
	- Examples:
		- "* Resold purchases of goods" -> "Resold purchases of goods"
		- "* Charges to be spread over several periods * Bond redemption premiums" -> "Charges to be spread over several periods * Bond redemption premiums"

- Rule 8: If an element starts with "=" followed by a numerical value, remove the starting "=" and the extra spaces before the numerical value.
	- Do not remove "=" contained in the middle of the text string.
	- Examples:
		- "= 1320" -> "1320"
		- "= - 12" -> "-12"
		- "= + 15" -> "+15"

- ### Feedback Loop ###
	- **Ensure that the array structure has the same number of input and output elements for every array.** 
  - **After applying the transformations, carefully review each element:**
  - **Ensure that no '$' symbols have been added to elements that did not originally contain one** , except as specified in Rule 3.
  - **Verify that numerical values without a '$' remain unchanged.**
  - Confirm that all currency symbols '$' are correctly positioned according to the rules.
  - Check that the array structure remains intact, with the same number of elements in each array.
---

## Step 3: Properly indent the "row header" column ##
- Remember, the 'row header' is the **first element** of each array following the column header. Only this element should be considered for indentation in Step 3.
- **Process every array in the tableList. Do not skip any arrays, even if the first element is empty.**
- **If the 'row header' (the first element of the array) is an empty string (""), do not remove the empty string.**
- Indent the 'row header' by two additional spaces if it represents a sub-category, detail, or component of the previous 'row header'.
- When determining whether to indent, examine and possibly modify only the 'row header' (the first element of each array). Do not consider or modify any other elements in the array, even if they contain similar terms.
- Indicators That a 'Row Header' Is a Subset Include:
	- The use of terms within the 'row header' itself, such as 'Series', 'Common Stock', 'Preferred Stock', 'Convertible Note', etc.
    - The 'row header' provides further specifications or types under the previous item.
- Examples:
	- Input: {
		["FIXED ASSETS", "NON-VALUE ASSETS (A)", "4,485,536.00", "3,038,011.00", "1,447,525.00", "1,839,476.00"],
		["", "Preliminary Expenses", "", "0.00", "0.00", "0.00"],
		["", "Charges to be spread over several periods", "4,485,536.00", "3,038,011.00", "1,447,525.00", "1,839,476.00"],}
	- Output: {
		["FIXED ASSETS", "NON-VALUE ASSETS (A)", "4,485,536.00", "3,038,011.00", "1,447,525.00", "1,839,476.00"],
		["", "  Preliminary Expenses", "", "0.00", "0.00", "0.00"],
		["", "  Charges to be spread over several periods", "4,485,536.00", "3,038,011.00", "1,447,525.00", "1,839,476.00"],}
	- Input: {
	  ["", "Amount", "Notes"],
		["Revenue", "100,000.00", "Total revenue for the period"],
		["Product Sales", "80,000.00", ""],
		["Service Income", "20,000.00", ""],
		["Expenses", "50,000.00", "Total expenses"],
		["Salaries", "30,000.00", "Staff salaries"],
		["Rent", "10,000.00", ""],
		["Utilities", "5,000.00", ""],
		["Miscellaneous", "5,000.00", "Various expenses"],
		["Net Income", "50,000.00", "Revenue minus Expenses"] }
	- Output: {
		["", "Amount", "Notes"],
		["Revenue", "100,000.00", "Total revenue for the period"],
		["  Product Sales", "80,000.00", ""],
		["  Service Income", "20,000.00", ""],
		["Expenses", "50,000.00", "Total expenses"],
		["  Salaries", "30,000.00", "Staff salaries"],
		["  Rent", "10,000.00", ""],
		["  Utilities", "5,000.00", ""],
		["  Miscellaneous", "5,000.00", "Various expenses"],
		["Net Income", "50,000.00", "Revenue minus Expenses"] }
	- Input: {
		["Investments", "Industry", "Country", "Quantity", "Cost", "Fair Value"],
		["TechCorp Inc.", "Technology", "USA", "1000", "50,000", "60,000"],
		["Series A Preferred Stock", "", "", "500", "25,000", "30,000"],
		["Series B Preferred Stock", "", "", "500", "25,000", "30,000"] }
	- Output: {
		["Investments", "Industry", "Country", "Quantity", "Cost", "Fair Value"],
		["TechCorp Inc.", "Technology", "USA", "1000", "50,000", "60,000"],
		["  Series A Preferred Stock", "", "", "500", "25,000", "30,000"],
		["  Series B Preferred Stock", "", "", "500", "25,000", "30,000"] }
	- Input: {
		["Beginning Balance", "$17,460,601"], 
		["Capital contributions", "97,862"],
		["Net investment loss",	"(5,776)"],
		["General Partner's preference", "391,447"],
		["Carried interest", "(3,069,715)"],
		["Net unrealized gain", "(381,490)"],
		["Ending Balance", "$14,492,929"] }
	- Output: {
		["Beginning Balance", "$17,460,601"], 
		["  Capital contributions", "97,862"],
		["  Net investment loss",	"(5,776)"],
		["  General Partner's preference", "391,447"],
		["  Carried interest", "(3,069,715)"],
		["  Net unrealized gain", "(381,490)"],
		["Ending Balance", "$14,492,929"] }
	- Input: {
		["Software-as-a-Service", "", "", "", "", ""],
		["Bravely, Inc.", "", "", "", "", ""],
		["Preferred stock", "6,914,563", "", "", "", ""],
		["Carefeed Inc.", "", "", "", "", ""],
		["Preferred stock", "3,323,684", "", "", "", ""],
		["Common stock", "812,893", "", "", "", ""],
		["Total Software-as-a-Service", "85,949,587", "", "", "", ""] }
	- Output: {
		["Software-as-a-Service", "", "", "", "", ""],
		["  Bravely, Inc.", "", "", "", "", ""],
		["    Preferred stock", "6,914,563", "", "", "", ""],
		["  Carefeed Inc.", "", "", "", "", ""],
		["    Preferred stock", "3,323,684", "", "", "", ""],
		["    Common stock", "812,893", "", "", "", ""],
		["Total Software-as-a-Service", "85,949,587", "", "", "", ""] }
	- Input: {
		["Net assets", "", ""], 
		["Net increase", "1234", "1234"] }
	- Output: {
		["Net assets", "", ""], 
		["Net increase", "1234", "1234"] }
	- Input: {
		["Columbia", "", ""],
		["e-Commerce", "", ""],
		["Percentage of Assets", "1234", "1234"],
		["OpenPass", "1234", "1234"],
		["SAFE", "1234", "1234"],
		["PhotoZino", "1234", "1234"],
		["Series A", "1234", "1234"],				
		["Total e-commerce", "1234", "1234"],
		["Total Columbia", "", ""] }
	- Output: {
		["Columbia", "", ""],
		["  e-Commerce", "", ""],
		["    Percentage of Assets", "1234", "1234"],
		["    OpenPass", "1234", "1234"],
		["      SAFE", "1234", "1234"],
		["    PhotoZino", "1234", "1234"],
		["      Series A", "1234", "1234"],		
		["  Total e-commerce", "1234", "1234"],
		["Total Columbia", "", ""] }
	- Input: {
		["Assets",""],
		["Investments, at estimated fair value (cost $139,019,302)", "$246,485,547"],
		["Cash and Cash Equivalents", "2,193,154"],
		["Contribution Receivable", "117,499"],
		["Escrow Receivable", "18,172"],
		["Total assets", "248,814,372"],
		["Liabilities", ""],
		["Due to Related Party", "608,288"],
		["Accounts Payable", "49,541"],
		["Contributions in Advance", "107,500"],
		["Total liabilities", "765,329"] }
	- Output: {
		["Assets",""],
		["  Investments, at estimated fair value (cost $139,019,302)", "$246,485,547"],
		["  Cash and Cash Equivalents", "2,193,154"],
		["  Contribution Receivable", "117,499"],
		["  Escrow Receivable", "18,172"],
		["Total assets", "248,814,372"],
		["Liabilities", ""],
		["  Due to Related Party", "608,288"],
		["  Accounts Payable", "49,541"],
		["  Contributions in Advance", "107,500"],
		["Total liabilities", "765,329"] }
		
- ### Feedback Loop ###
  - After completing Step 3, verify that:
	- **Verify that every array in the tableList has the same number of input and output elements.**
	- Only the 'row header' elements (first element of each array) have been considered for indentation.
	- No other elements in any array have been indented or modified in Step 3.
	- The indentation has been applied correctly according to the specified instructions.
---

## Final Review ##
- After completing all steps, thoroughly review the entire 'output' to ensure:
  - All transformations have been correctly applied according to the rules. 
  - No unintended changes or errors are present.
  - **Verify that every array in the tableList has the same number of input and output elements.**
  - The data is consistent and accurate, reflecting all specified formatting requirements.
---

## Output ##
- The response should be a valid JSON object that contains an "output" property, which is a list of arrays of strings.
- Ensure the output can be parsed into a valid JSON object.
\`\`\`
`

/**
 * Flow tool that post-processes the output of a preceding DocuPanda step.
 *
 * For every section of type `table` found in the previous step's output, the
 * table rows are cleaned (see {@link cleanArray}), sent to the Azure-hosted
 * model together with the prompt in `instructions`, and the section's
 * `tableList` is replaced with the rows returned by the model. The updated
 * document is then uploaded through the flow service.
 */
export class DocuPandaJsonIndenterTool extends BaseFlowTool {

  name = 'json-indenter'
  description = 'Data Validation and Formatting (standard)'
  // Populated by initialize(), which needs the flow service to build the precedent tools.
  precedents: FlowTool[] = []
  type = 'transformer'
  apiType = AgentType.completions
  outputType = 'json'
  assistantId = 'asst_kxIxCOBgOqaHr2C9FxP6G8xP'
  instructions = INSTRUCTIONS
  parameters: JsonFormData
  // Timestamp (ms) taken at the start of execute(); used to report elapsed time per logged step.
  startTime = 0

  /**
   * @param flowService Optional flow service. When supplied, the tool is
   *   initialized immediately; otherwise `initialize` must be called later.
   */
  constructor(flowService?: FileflowServiceInterface) {
    super()
    if (flowService)
      this.initialize(flowService)
  }

  /**
   * Binds the flow service, installs the default transformer parameters and
   * registers the DocuPanda tools as precedents of this tool.
   *
   * @param flowService Service used for storage, logging and model execution.
   * @returns This tool, to allow chaining.
   */
  initialize(flowService: FileflowServiceInterface) {
    this.flowService = flowService
    this.parameters = defaultTransformerParameters
    // Assign instead of push so that a repeated initialize() call does not
    // accumulate duplicate precedents.
    this.precedents = [
      new DocuPandaTool(this.flowService),
      new DocuPandaAsImageTool(this.flowService),
      new DocuPandaRemoveWatermarkTool(this.flowService),
    ]
    return this
  }

  /** This tool always reports that its result should be published. */
  shouldPublish(): boolean {
    return true
  }

  /**
   * Runs the tool against the output of the previous flow step.
   *
   * @param file The file being processed.
   * @param last The previous flow step whose stored contents are the input.
   * @param params Optional run parameters. Fields read here: `instructions`
   *   (prompt override), `model` (defaults to 'gpt-4o'), `saveGlobally`
   *   (persist `instructions` on the tool) and `production` (forwarded to the
   *   model call).
   * @throws Error when the previous step is missing or has no stored output.
   */
  async execute(
      file: FlowFile,
      last: FlowStep | null,
      params?: any)
  {
    this.startTime = Date.now()

    // Get the output from the last step
    const output = last ? await this.flowService.getFileContents(last.storageName) : null
    if (!output)
      throw new Error(`No output found in last step ${last?.name} ${last?.outputURL}`)

    // Only persist when instructions were actually provided; otherwise the
    // tool's prompt would be overwritten with undefined.
    if (params?.saveGlobally && params.instructions) {
      this.instructions = params.instructions
      // NOTE(review): awaited in case updateTool is asynchronous - confirm against the service interface.
      await this.flowService.updateTool(this, { instructions: params.instructions })
    }
    const instructions = params?.instructions || this.instructions
    const model = params?.model || 'gpt-4o'

    // Correct the indentation of the tables in the output
    const pages = output.data.result.pages
    const pageCount = pages?.length ?? 0
    for (let i = 0; i < pageCount; i++) {
      // Tolerate pages that carry no sections array.
      const sections = pages[i].sections ?? []
      for (let j = 0; j < sections.length; j++) {
        const section = sections[j]
        if (section.type !== 'table')
          continue

        this.flowService.log(`Found table in page ${i} section ${j}`)
        const cleanRows = this.cleanArray(section.tableList)
        // params is optional, so `production` must be read defensively.
        const fixedRows = await this.correctDocupandaIndentation(
          file,
          JSON.stringify(cleanRows),
          instructions,
          model,
          params?.production
        )
        if (Array.isArray(fixedRows)) {
          // replace the tableList with the corrected rows
          section.tableList = fixedRows
        } else {
          // The model response had no usable "output" list; keep the cleaned
          // rows rather than erasing the table.
          this.flowService.log(`No "output" array returned for page ${i} section ${j}; keeping cleaned rows`)
          section.tableList = cleanRows
        }
      }
    }
    // Update the FlowStep
    await this.flowService.uploadAnalysis(this, file, output)
  }

  /**
   * Normalizes the cells of a table before it is sent to the model.
   *
   * Cells equal to `UNDERSCORE` become empty strings (12/15/24 - Docupanda is
   * adding an underscore in cells with no values), and every match of
   * `NON_PRINTABLE_REGEX` is removed from the remaining cells. Null/undefined
   * cells become empty strings and other non-string cells are stringified so
   * that every cell in the result is a string.
   *
   * @param rows Table rows; a missing or non-array value yields an empty table.
   * @returns A new array of rows; the input is not modified.
   */
  cleanArray(rows: string[][]): string[][] {
    if (!Array.isArray(rows))
      return []
    return rows.map(row =>
      row.map((cell: string) => {
        const text = cell == null ? '' : String(cell)
        if (text === UNDERSCORE)
          return ''
        return text.replaceAll(NON_PRINTABLE_REGEX, '')
      })
    )
  }

  /**
   * Sends one table to the model, records the prompt and a pipeline log entry
   * (tokens, estimated cost, elapsed time), and returns the model's rows.
   *
   * @param file The file the table belongs to.
   * @param tableList The table as passed by the caller (execute() passes a JSON string).
   * @param instructions Prompt forwarded to the model call.
   * @param model Model name; also the key used to look up `CHATGPT_COSTS`.
   * @param isProduction Forwarded unchanged to `executeAzureTool`.
   * @returns The `output` property of the parsed model response (may be
   *   undefined if the response lacks it; the caller validates).
   * @throws SyntaxError when the model response is not valid JSON.
   */
  private async correctDocupandaIndentation(file: FlowFile, tableList: any, instructions: string, model: string, isProduction: boolean) {
    const messages = [
      {
        role: 'user',
        content: `${tableList}`,
      }
    ]
    const { usage, result } = await executeAzureTool(
      this,
      messages,
      this.flowService,
      { instructions, model, isProduction }
    )

    // A literal `null` response parses successfully, hence the optional chain.
    const actualOutput = JSON.parse(result)?.output
    this.flowService.log('Output: ', { instructions, result, actual_output: actualOutput })

    const userId = this.flowService.getUserId()
    const user = await this.flowService.getUser(userId)
    const prompt = await this.flowService.updateStepPrompt(file, this.name, tableList, JSON.stringify(actualOutput))
    // Cost table rates are applied per 1000 tokens; unknown models are logged at zero cost.
    const costs = CHATGPT_COSTS[model]
    const promptTokens = usage?.prompt_tokens ?? 0
    const completionTokens = usage?.completion_tokens ?? 0
    const azureCost = costs
      ? (promptTokens * costs.input / 1000) + (completionTokens * costs.output / 1000)
      : 0
    await this.flowService.logPipelineStep({
      user: userId,
      groupDocId: user?.groupDocId,
      fileDocId: file.docId,
      fileName: file.fileName,
      loggedAt: new Date(),
      fileSize: file.size,
      stepName: this.name,
      promptDocId: prompt.id,
      azureModelUsed: model,
      azureTokensUsed: usage?.total_tokens,
      azureCost,
      // Measured from the start of execute(), so it is cumulative across tables in one run.
      elapsedMsec: Date.now() - this.startTime
    })
    return actualOutput
  }

  /**
   * @param fileName Unused; the same disposition is returned for every file.
   * @returns The content-disposition value for this tool's output.
   */
  getContentDisposition(fileName: string) {
    return 'inline;'
  }

  /** Builds the preview for this tool's output by delegating to the shared DocuPanda page preview helper. */
  generateOutputPreview(output: any) {
    return generateDocupandaPageOutputPreview(output)
  }
}
