diff --git a/csv/_io.ts b/csv/_io.ts index 5e80959c55b0..f3fbb4e4a498 100644 --- a/csv/_io.ts +++ b/csv/_io.ts @@ -6,7 +6,7 @@ import { codePointLength } from "./_shared.ts"; -/** Options for {@linkcode parseRecord}. */ +/** Options for {@linkcode parseLine} and {@linkcode parseRecord}. */ export interface ReadOptions { /** Character which separates values. * @@ -56,13 +56,31 @@ export interface LineReader { isEOF(): boolean; } -export async function parseRecord( +/** + * Synchronous CSV record primitive. + * + * Parses a complete CSV record (one or more lines joined by `\n`, since a + * quoted field may legally span lines) into an array of fields. Both + * {@linkcode parseRecord} (async, line-pulling) and the top-level + * {@linkcode parse} build on top of this function so the field-level rules + * live in exactly one place. + * + * Returns: + * - `string[]` when the input is a complete record + * - `null` when the input ends inside an unclosed quoted field and the + * caller has more input it can append (set `atEof` to `true` to force + * an EOF decision instead of returning `null`) + * + * Throws {@linkcode SyntaxError} for hard syntax errors (bare quote in a + * non-quoted field, extraneous `"` after a closing quote, unclosed quoted + * field at EOF without `lazyQuotes`). 
+ */ +export function parseLine( fullLine: string, - reader: LineReader, options: ReadOptions, - zeroBasedRecordStartLine: number, - zeroBasedLine: number = zeroBasedRecordStartLine, -): Promise<Array<string>> { + zeroBasedRecordStartLine: number = 0, + atEof: boolean = true, +): string[] | null { // line starting with comment character is ignored if (options.comment && fullLine[0] === options.comment) { return []; @@ -78,6 +96,24 @@ export async function parseRecord( const separatorLen = options.separator.length; let recordBuffer = ""; const fieldIndexes = [] as number[]; + + // Map an absolute position in `fullLine` to the (line, column) where it + // lives, accounting for embedded `\n` from joined multi-line records. The + // returned line number is offset from the record's first line; column is in + // code-points (matches the existing error message format). + const locate = (absPos: number): { line: number; col: number } => { + let line = zeroBasedRecordStartLine; + let lastNewline = -1; + for (let i = 0; i < absPos; i++) { + if (fullLine[i] === "\n") { + line++; + lastNewline = i; + } + } + const col = codePointLength(fullLine.slice(lastNewline + 1, absPos)); + return { line, col }; + }; + parseField: while (true) { if (options.trimLeadingSpace) { line = line.trimStart(); @@ -94,13 +130,13 @@ export async function parseRecord( if (!options.lazyQuotes) { const j = field.indexOf(quote); if (j >= 0) { - const col = codePointLength( - fullLine.slice(0, fullLine.length - line.slice(j).length), + const { line: errLine, col } = locate( + fullLine.length - line.slice(j).length, ); throw new SyntaxError( createBareQuoteErrorMessage( zeroBasedRecordStartLine, - zeroBasedLine, + errLine, col, ), ); @@ -140,52 +176,54 @@ export async function parseRecord( recordBuffer += quote; } else { // `"*` sequence (invalid non-escaped quote).
- const col = codePointLength( - fullLine.slice(0, fullLine.length - line.length - quoteLen), + const { line: errLine, col } = locate( + fullLine.length - line.length - quoteLen, ); throw new SyntaxError( createQuoteErrorMessage( zeroBasedRecordStartLine, - zeroBasedLine, + errLine, col, ), ); } - } else if (line.length > 0 || !reader.isEOF()) { - // Hit end of line (copy all data so far). - recordBuffer += line; - const r = await reader.readLine(); - line = r ?? ""; // This is a workaround for making this module behave similarly to the encoding/csv/reader.go. - fullLine = line; - if (r === null) { - // Abrupt end of file (EOF or error). - if (!options.lazyQuotes) { - const col = codePointLength(fullLine); + } else { + // No more quotes on this line. Record continues onto the next line + // (the caller has already joined them with `\n` so `line` already + // contains the rest of the buffered input). If we're not yet at EOF, + // signal the caller to feed more input by returning `null`. + if (!atEof) { + return null; + } + // At EOF: same as the old reader-based path's "abrupt end of file" + // branches. The old code distinguished two cases by whether `line` + // (the unprocessed remainder) was empty when EOL hit: + // - `line` empty → "abrupt EOF" branch, column = end of original + // input (the quote opened but the entire body was consumed + // before EOF); applies to inputs with an odd number of quotes + // on a single line. + // - `line` non-empty → would have fallen through to a final + // readLine, which returned null and reset `fullLine` to `""`, + // so column was 0 on the line after the last consumed segment. 
+ if (!options.lazyQuotes) { + if (line.length === 0) { throw new SyntaxError( createQuoteErrorMessage( zeroBasedRecordStartLine, - zeroBasedLine, - col, + locate(fullLine.length).line, + locate(fullLine.length).col, ), ); } - fieldIndexes.push(recordBuffer.length); - break parseField; - } - zeroBasedLine++; - recordBuffer += "\n"; // preserve line feed (This is because TextProtoReader removes it.) - } else { - // Abrupt end of file (EOF on error). - if (!options.lazyQuotes) { - const col = codePointLength(fullLine); + let errLine = zeroBasedRecordStartLine; + for (let i = 0; i < fullLine.length; i++) { + if (fullLine[i] === "\n") errLine++; + } throw new SyntaxError( - createQuoteErrorMessage( - zeroBasedRecordStartLine, - zeroBasedLine, - col, - ), + createQuoteErrorMessage(zeroBasedRecordStartLine, errLine, 0), ); } + recordBuffer += line; fieldIndexes.push(recordBuffer.length); break parseField; } @@ -201,6 +239,50 @@ export async function parseRecord( return result; } +/** + * Async wrapper that builds on {@linkcode parseLine}: pulls additional lines + * from `reader` whenever the current accumulated line ends inside an unclosed + * quoted field, then defers all field-level parsing to `parseLine`. + * + * This keeps the streaming caller (`CsvParseStream`) on a single shared + * primitive without re-implementing any field/quote rules. + */ +export async function parseRecord( + fullLine: string, + reader: LineReader, + options: ReadOptions, + zeroBasedRecordStartLine: number, +): Promise<Array<string>> { + let accumulated = fullLine; + while (true) { + const result = parseLine( + accumulated, + options, + zeroBasedRecordStartLine, + reader.isEOF(), + ); + if (result !== null) { + return result; + } + // parseLine returned null → record continues onto another line. + const next = await reader.readLine(); + if (next === null) { + // Reader claimed it was not at EOF but yielded null — force a final + // pass with atEof=true so parseLine throws/handles EOF consistently.
+ const eofResult = parseLine( + accumulated, + options, + zeroBasedRecordStartLine, + true, + ); + // parseLine with atEof=true cannot return null; this is a defensive + // narrowing for the type system. + return eofResult ?? []; + } + accumulated += "\n" + next; + } +} + export function createBareQuoteErrorMessage( zeroBasedRecordStartLine: number, zeroBasedLine: number, diff --git a/csv/parse.ts b/csv/parse.ts index 2ad28afd411e..f471fc51f9a0 100644 --- a/csv/parse.ts +++ b/csv/parse.ts @@ -3,17 +3,75 @@ import { convertRowToObject, - createBareQuoteErrorMessage, - createQuoteErrorMessage, + parseLine as parseLineInternal, type ParseResult, type ReadOptions, type RecordWithColumn, } from "./_io.ts"; -import { codePointLength } from "./_shared.ts"; export type { ParseResult, RecordWithColumn }; -const BYTE_ORDER_MARK = "\ufeff"; +const BYTE_ORDER_MARK = "\ufeff"; + +/** + * Parse a single CSV record into its fields. + * + * `parseLine` is the synchronous primitive that `parse` and `CsvParseStream` + * are both built on. It is exported so callers that already own line + * splitting (for example, after `TextLineStream`) can reuse the same field + * rules without spinning up a parser class. + * + * Multi-line quoted fields are supported: pass the joined record (each + * source line separated by `\n`) and the function will treat the embedded + * newlines as field content. + * + * @example Usage + * ```ts + * import { parseLine } from "@std/csv/parse"; + * import { assertEquals } from "@std/assert/equals"; + * + * assertEquals(parseLine("a,b,c"), ["a", "b", "c"]); + * assertEquals(parseLine(`"a","b,c","d"`), ["a", "b,c", "d"]); + * ``` + * + * @example Custom separator + * ```ts + * import { parseLine } from "@std/csv/parse"; + * import { assertEquals } from "@std/assert/equals"; + * + * assertEquals(parseLine("a\tb\tc", { separator: "\t" }), ["a", "b", "c"]); + * ``` + * + * @param line The single CSV record to parse.
May contain embedded `\n` + * characters inside quoted fields. + * @param options Parsing options. Same shape as the read-side options + * accepted by {@linkcode parse}. + * @returns The fields parsed from the record. + */ +export function parseLine( + line: string, + options: Omit<ParseOptions, "skipFirstRow" | "columns" | "fieldsPerRecord"> = + {}, +): string[] { + const { separator = ",", trimLeadingSpace = false, comment, lazyQuotes } = + options; + const stripped = line.startsWith(BYTE_ORDER_MARK) ? line.slice(1) : line; + // Treat a single trailing CR/LF/CRLF as a record terminator (callers that + // forgot to trim should not see a phantom empty trailing field). + const normalized = stripped.endsWith("\r\n") + ? stripped.slice(0, -2) + : stripped.endsWith("\n") || stripped.endsWith("\r") + ? stripped.slice(0, -1) + : stripped; + const readOptions: ReadOptions = { + separator, + trimLeadingSpace, + ...(comment !== undefined ? { comment } : {}), + ...(lazyQuotes !== undefined ? { lazyQuotes } : {}), + }; + const result = parseLineInternal(normalized, readOptions, 0, true); + return result ?? []; +} class Parser { #input = ""; @@ -21,9 +79,9 @@ class Parser { #options: { separator: string; trimLeadingSpace: boolean; - comment: string | undefined; - lazyQuotes: boolean | undefined; - fieldsPerRecord: number | undefined; + comment?: string; + lazyQuotes?: boolean; + fieldsPerRecord?: number; }; constructor({ separator = ",", @@ -35,9 +93,9 @@ class Parser { this.#options = { separator, trimLeadingSpace, - comment, - lazyQuotes, - fieldsPerRecord, + ...(comment !== undefined ? { comment } : {}), + ...(lazyQuotes !== undefined ? { lazyQuotes } : {}), + ...(fieldsPerRecord !== undefined ?
{ fieldsPerRecord } : {}), }; } #readLine(): string | null { @@ -71,138 +129,37 @@ class Parser { return this.#cursor >= this.#input.length; } #parseRecord(zeroBasedStartLine: number): string[] | null { - let fullLine = this.#readLine(); - if (fullLine === null) return null; - if (fullLine.length === 0) { - return []; - } - - let zeroBasedLine = zeroBasedStartLine; - - // line starting with comment character is ignored - if (this.#options.comment && fullLine[0] === this.#options.comment) { + const first = this.#readLine(); + if (first === null) return null; + if (first.length === 0) { return []; } - let line = fullLine; - const quote = '"'; - const quoteLen = quote.length; - const separatorLen = this.#options.separator.length; - let recordBuffer = ""; - const fieldIndexes = [] as number[]; - parseField: while (true) { - if (this.#options.trimLeadingSpace) { - line = line.trimStart(); - } - - if (line.length === 0 || !line.startsWith(quote)) { - // Non-quoted string field - const i = line.indexOf(this.#options.separator); - let field = line; - if (i >= 0) { - field = field.substring(0, i); - } - // Check to make sure a quote does not appear in field. - if (!this.#options.lazyQuotes) { - const j = field.indexOf(quote); - if (j >= 0) { - const col = codePointLength( - fullLine.slice(0, fullLine.length - line.slice(j).length), - ); - throw new SyntaxError( - createBareQuoteErrorMessage( - zeroBasedStartLine, - zeroBasedLine, - col, - ), - ); - } - } - recordBuffer += field; - fieldIndexes.push(recordBuffer.length); - if (i >= 0) { - line = line.substring(i + separatorLen); - continue parseField; - } - break parseField; - } else { - // Quoted string field - line = line.substring(quoteLen); - while (true) { - const i = line.indexOf(quote); - if (i >= 0) { - // Hit next quote. - recordBuffer += line.substring(0, i); - line = line.substring(i + quoteLen); - if (line.startsWith(quote)) { - // `""` sequence (append quote). 
- recordBuffer += quote; - line = line.substring(quoteLen); - } else if (line.startsWith(this.#options.separator)) { - // `","` sequence (end of field). - line = line.substring(separatorLen); - fieldIndexes.push(recordBuffer.length); - continue parseField; - } else if (0 === line.length) { - // `"\n` sequence (end of line). - fieldIndexes.push(recordBuffer.length); - break parseField; - } else if (this.#options.lazyQuotes) { - // `"` sequence (bare quote). - recordBuffer += quote; - } else { - // `"*` sequence (invalid non-escaped quote). - const col = codePointLength( - fullLine.slice(0, fullLine.length - line.length - quoteLen), - ); - throw new SyntaxError( - createQuoteErrorMessage(zeroBasedStartLine, zeroBasedLine, col), - ); - } - } else if (line.length > 0 || !(this.#isEOF())) { - // Hit end of line (copy all data so far). - recordBuffer += line; - const r = this.#readLine(); - line = r ?? ""; // This is a workaround for making this module behave similarly to the encoding/csv/reader.go. - fullLine = line; - if (r === null) { - // Abrupt end of file (EOF or error). - if (!this.#options.lazyQuotes) { - const col = codePointLength(fullLine); - throw new SyntaxError( - createQuoteErrorMessage( - zeroBasedStartLine, - zeroBasedLine, - col, - ), - ); - } - fieldIndexes.push(recordBuffer.length); - break parseField; - } - zeroBasedLine++; - recordBuffer += "\n"; // preserve line feed (This is because TextProtoReader removes it.) - } else { - // Abrupt end of file (EOF on error). - if (!this.#options.lazyQuotes) { - const col = codePointLength(fullLine); - throw new SyntaxError( - createQuoteErrorMessage(zeroBasedStartLine, zeroBasedLine, col), - ); - } - fieldIndexes.push(recordBuffer.length); - break parseField; - } - } + // Defer all field-level parsing to the shared primitive. 
If the line ends + // inside an unclosed quoted field, accumulate the next line and re-parse; + // we own line iteration here, so the primitive's `atEof` signal tells us + // when to give up. + let accumulated = first; + while (true) { + const result = parseLineInternal( + accumulated, + this.#options, + zeroBasedStartLine, + this.#isEOF(), + ); + if (result !== null) return result; + const next = this.#readLine(); + if (next === null) { + // Force the EOF decision (will throw unless lazyQuotes is set). + return parseLineInternal( + accumulated, + this.#options, + zeroBasedStartLine, + true, + ) ?? []; } + accumulated += "\n" + next; } - const result = [] as string[]; - let preIdx = 0; - for (const i of fieldIndexes) { - result.push(recordBuffer.slice(preIdx, i)); - preIdx = i; - } - return result; } parse(input: string): string[][] { this.#input = input.startsWith(BYTE_ORDER_MARK) ? input.slice(1) : input; @@ -240,7 +197,6 @@ class Parser { } else if (options.fieldsPerRecord === 0) { _nbFields = "UNINITIALIZED"; } else { - // TODO: Should we check if it's a valid integer? _nbFields = options.fieldsPerRecord; } diff --git a/csv/parse_test.ts b/csv/parse_test.ts index 1912758b8100..fed3cbe8b495 100644 --- a/csv/parse_test.ts +++ b/csv/parse_test.ts @@ -5,7 +5,7 @@ // Copyright 2018-2026 the Deno authors. MIT license. 
import { assert, assertEquals, assertThrows } from "@std/assert"; -import { parse, type ParseOptions } from "./parse.ts"; +import { parse, parseLine, type ParseOptions } from "./parse.ts"; import type { AssertTrue, IsExact } from "@std/testing/types"; const BYTE_ORDER_MARK = "\ufeff"; @@ -1023,3 +1023,106 @@ Deno.test({ } }, }); + +Deno.test({ + name: "parseLine() splits a simple comma-separated record", + fn() { + assertEquals(parseLine("a,b,c"), ["a", "b", "c"]); + }, +}); + +Deno.test({ + name: "parseLine() handles quoted fields with embedded commas", + fn() { + assertEquals(parseLine(`"a","b,c","d"`), ["a", "b,c", "d"]); + }, +}); + +Deno.test({ + name: "parseLine() handles escaped quotes inside quoted fields", + fn() { + assertEquals(parseLine(`"a ""word""","plain"`), [`a "word"`, "plain"]); + }, +}); + +Deno.test({ + name: "parseLine() supports a custom separator", + fn() { + assertEquals( + parseLine("a\tb\tc", { separator: "\t" }), + ["a", "b", "c"], + ); + }, +}); + +Deno.test({ + name: "parseLine() trims leading whitespace when trimLeadingSpace is set", + fn() { + assertEquals( + parseLine(" a, b, c", { trimLeadingSpace: true }), + ["a", "b", "c"], + ); + }, +}); + +Deno.test({ + name: "parseLine() strips a leading byte-order mark", + fn() { + assertEquals(parseLine("\ufeffa,b,c"), ["a", "b", "c"]); + }, +}); + +Deno.test({ + name: "parseLine() strips a single trailing newline", + fn() { + assertEquals(parseLine("a,b,c\n"), ["a", "b", "c"]); + assertEquals(parseLine("a,b,c\r\n"), ["a", "b", "c"]); + assertEquals(parseLine("a,b,c\r"), ["a", "b", "c"]); + }, +}); + +Deno.test({ + name: "parseLine() returns embedded newlines from a multi-line quoted field", + fn() { + assertEquals(parseLine(`"a\nb",c`), ["a\nb", "c"]); + }, +}); + +Deno.test({ + name: "parseLine() throws on a bare quote in an unquoted field", + fn() { + assertThrows( + () => parseLine(`a,b"c,d`), + SyntaxError, + `bare " in non-quoted-field`, + ); + }, +}); + +Deno.test({ + name: "parseLine()
tolerates bare quotes when lazyQuotes is set", + fn() { + assertEquals( + parseLine(`a,b"c,d`, { lazyQuotes: true }), + ["a", `b"c`, "d"], + ); + }, +}); + +Deno.test({ + name: "parseLine() returns an empty array for a comment line", + fn() { + assertEquals(parseLine("# header line", { comment: "#" }), []); + }, +}); + +Deno.test({ + name: "parseLine() throws on an unclosed quoted field", + fn() { + assertThrows( + () => parseLine(`"unclosed`), + SyntaxError, + `extraneous or missing " in quoted-field`, + ); + }, +});