Conversation
node-zerox/src/openAI.ts
Outdated
| import axios from "axios"; | ||
| import { nanoid } from "nanoid"; | ||
|
|
||
| const markdownToJson = async (markdownString: string) => { |
There was a problem hiding this comment.
nit: I would move this to the utils folder.
8f2a9de to
fab05db
Compare
|
@ZeeshanZulfiqarAli can you make this optional, with something like an opt-in flag? I think the response should probably always include the pages, and then have this as an optional output. Also, if you come up with a better name than "chunks", I'm game. |
|
Why is this doing image>markdown>JSON instead of image>JSON? |
|
@batmanscode this is specifically for chunking the OCR results into a JSON array, rather than running JSON extraction. It is primarily for chunking / indexing use cases. We're parsing the Markdown to chunk the page into elements (i.e. headings, text, lists, and tables). |
Ah ok I see, thanks 😃 |
This PR adds the ability to parse Markdown into a JSON array.
For the following markdown:
This JSON is produced:
[ { "id": "03qdyA5ROy5EQB-WaKEDo", "page": 1, "type": "heading", "value": "Deloitte." }, { "id": "TaSFQxq1WE5c0Z6Ey5Crz", "page": 1, "parentId": "03qdyA5ROy5EQB-WaKEDo", "type": "heading", "value": "Quality System Audit for BioTech Innovations (Pty) Ltd" }, { "id": "m8N1Hi_wXBPZa9sNMX_qc", "page": 1, "parentId": "TaSFQxq1WE5c0Z6Ey5Crz", "type": "heading", "value": "Opening Meeting Sign-in Sheet" }, { "id": "aM1oWUjreyW7I7qMhgmqq", "page": 1, "parentId": "m8N1Hi_wXBPZa9sNMX_qc", "type": "text", "value": "Audit Date: 02 October 2024 Time: 06h30 Supplier: BioTech Innovations (Pty) Ltd; 67 River Rd, Kensington, Johannesburg, Gauteng, 2094 South Africa. Contact Person: Kathy Margaret Phone Number: +14 22 045 4952" }, { "id": "8h3dkSXVdr8smZZc-4ttN", "page": 1, "parentId": "m8N1Hi_wXBPZa9sNMX_qc", "type": "text", "value": "Opening Meeting Agenda:" }, { "id": "z5VvXnAdLsXF2MZYRotd5", "page": 1, "parentId": "m8N1Hi_wXBPZa9sNMX_qc", "type": "list", "value": [ { "id": "D3TpFf6bwXAO2DbT0nxtj", "page": 1, "type": "text", "value": "Introductions" }, { "id": "XvZKI0Gs5uCpRDcyBQELn", "page": 1, "type": "text", "value": "Review of audit agenda" }, { "id": "RidWUv1CjLSdk6XvTbgwU", "page": 1, "type": "text", "value": "Confirmation of availability for required persons" } ] }, { "id": "SwXj9Gx98CgnizEURc5YY", "page": 1, "parentId": "m8N1Hi_wXBPZa9sNMX_qc", "type": "text", "value": "Opening Meeting Attendees:" }, { "id": "2BreVoV6ptcefEADutqJV", "page": 1, "parentId": "m8N1Hi_wXBPZa9sNMX_qc", "type": "table", "value": { "headers": [ { "value": "No.", "id": "_wFjO1laIvhXpCVJLHBEG" }, { "value": "Print Name", "id": "DdKaPNWWWC0vllcgllwqK" }, { "value": "Job Title", "id": "81_M1ohyeRavAtldkiBx1" }, { "value": "Email", "id": "9HWpmkk7Eskz0RSwsJAS1" }, { "value": "Signature", "id": "pXm6dctuxr9GFm0YFecrU" } ], "rows": [ { "_wFjO1laIvhXpCVJLHBEG": "1", "DdKaPNWWWC0vllcgllwqK": "Anna Pojanvis", "81_M1ohyeRavAtldkiBx1": "CTO", "9HWpmkk7Eskz0RSwsJAS1": "anna@getomni.ai", 
"pXm6dctuxr9GFm0YFecrU": "a p" }, { "_wFjO1laIvhXpCVJLHBEG": "2", "DdKaPNWWWC0vllcgllwqK": "Tyler Maran", "81_M1ohyeRavAtldkiBx1": "CEO", "9HWpmkk7Eskz0RSwsJAS1": "tyler@getomni.ai", "pXm6dctuxr9GFm0YFecrU": "" }, { "_wFjO1laIvhXpCVJLHBEG": "3", "DdKaPNWWWC0vllcgllwqK": "Kathy Margaret", "81_M1ohyeRavAtldkiBx1": "Associate", "9HWpmkk7Eskz0RSwsJAS1": "kmargaret@qaconsultants.com", "pXm6dctuxr9GFm0YFecrU": "" }, { "_wFjO1laIvhXpCVJLHBEG": "4", "DdKaPNWWWC0vllcgllwqK": "Mark Ding", "81_M1ohyeRavAtldkiBx1": "Eng", "9HWpmkk7Eskz0RSwsJAS1": "mark@getomni.ai", "pXm6dctuxr9GFm0YFecrU": "" }, { "_wFjO1laIvhXpCVJLHBEG": "5", "DdKaPNWWWC0vllcgllwqK": "", "81_M1ohyeRavAtldkiBx1": "", "9HWpmkk7Eskz0RSwsJAS1": "", "pXm6dctuxr9GFm0YFecrU": "" } ] } }, { "id": "KmwROFSUArfod3PQ1R0Km", "page": 1, "parentId": "m8N1Hi_wXBPZa9sNMX_qc", "type": "text", "value": "QAC Auditor: David Thompson, Lead Quality Auditor, NTA Services on behalf of BioTech Innovations (Biopharmaceuticals)." }, { "id": "jMM4sM-05M1G5E734rBUQ", "page": 1, "parentId": "m8N1Hi_wXBPZa9sNMX_qc", "type": "text", "value": "Page 1 of 7" }, { "id": "zQrMLcCMrpjIKXRmGEwoA", "page": 1, "parentId": "m8N1Hi_wXBPZa9sNMX_qc", "type": "text", "value": "DELOITTE QUALITY ASSURANCE CONSULTANTS, LLC 450 Oceanview Drive, Suite 200 - Santa Monica, CA 90405 - PHONE (800) 555-1234 (310) 555-7890 - FAX (310) 555-4567 Website: www.qaconsultants.com - Email: contact@qaconsultants.com" } ]