Friday, October 17, 2025

Javascript Module 80

  Module 80: JavaScript Scanner.

Design choices I made (reasoning)

Use a handwritten character loop (FSM style): it gives more control and makes it easier to handle longest matches, escapes in strings, and comments. This approach is commonly used in lexers because it's predictable and fast.

Represent tokens as {type, value, start, end, line, col} objects.

Provide a token spec (simple map of keyword names to token types) so you can change language surface easily.

Expose incremental API (nextToken) for parser integration and tokenizeAll() for quick usage.


JavaScript Scanner 

Save as scanner.js 

// scanner.js
// A simple, extensible JavaScript scanner/lexer implementation.
// Token types: IDENT, NUMBER, STRING, OP, PUNC, KEYWORD, EOF, COMMENT

class Scanner {
  constructor(source, options = {}) {
    this.source = source;
    this.pos = 0;
    this.len = source.length;
    this.line = 1;
    this.col = 1;
    this.options = options;
    // keywords set (customize)
    this.keywords = new Set(options.keywords || [
      "if", "else", "while", "for", "function", "return",
      "var", "let", "const", "true", "false", "null"
    ]);
  }

  // Utility: current char
  peekChar(offset = 0) {
    return this.source[this.pos + offset] || null;
  }

  // Utility: advance one char and update position
  advance() {
    const ch = this.source[this.pos++];
    if (ch === '\n') { this.line++; this.col = 1; }
    else this.col++;
    return ch;
  }

  // Return token object helper
  makeToken(type, value, startPos, endPos, startLine, startCol) {
    return { type, value, start: startPos, end: endPos, line: startLine, col: startCol };
  }

  // Skip whitespace and optionally collect comments
  skipSpacesAndComments() {
    while (this.pos < this.len) {
      const ch = this.peekChar();
      // whitespace
      if (/\s/.test(ch)) { this.advance(); continue; }
      // single-line comment //
      if (ch === '/' && this.peekChar(1) === '/') {
        this.advance(); this.advance();
        let comment = '';
        while (this.pos < this.len && this.peekChar() !== '\n') comment += this.advance();
        // return comment token if requested
        if (this.options.includeComments) {
          return this.makeToken('COMMENT', comment, this.pos - comment.length - 2, this.pos, this.line, this.col);
        }
        continue;
      }
      // multi-line comment /* ... */
      if (ch === '/' && this.peekChar(1) === '*') {
        this.advance(); this.advance();
        let comment = '';
        while (this.pos < this.len && !(this.peekChar() === '*' && this.peekChar(1) === '/')) {
          comment += this.advance();
        }
        // consume closing */
        if (this.peekChar() === '*' && this.peekChar(1) === '/') {
          this.advance(); this.advance();
        } else {
          throw new SyntaxError(`Unterminated comment at line ${this.line} col ${this.col}`);
        }
        if (this.options.includeComments) {
          return this.makeToken('COMMENT', comment, this.pos - comment.length - 4, this.pos, this.line, this.col);
        }
        continue;
      }
      break;
    }
    return null;
  }

  // Read identifier or keyword: [a-zA-Z_$][a-zA-Z0-9_$]*
  readIdentifierOrKeyword() {
    const startPos = this.pos;
    const startLine = this.line;
    const startCol = this.col;
    let id = '';
    let ch = this.peekChar();
    while (ch && /[A-Za-z0-9_$]/.test(ch)) {
      id += this.advance();
      ch = this.peekChar();
    }
    const type = this.keywords.has(id) ? 'KEYWORD' : 'IDENT';
    return this.makeToken(type, id, startPos, this.pos, startLine, startCol);
  }

  // Read number (int or float, simple): digits optionally .digits
  readNumber() {
    const startPos = this.pos;
    const startLine = this.line;
    const startCol = this.col;
    let num = '';
    let ch = this.peekChar();
    while (ch && /[0-9]/.test(ch)) { num += this.advance(); ch = this.peekChar(); }
    // fraction
    if (ch === '.' && /[0-9]/.test(this.peekChar(1))) {
      num += this.advance(); // consume '.'
      ch = this.peekChar();
      while (ch && /[0-9]/.test(ch)) { num += this.advance(); ch = this.peekChar(); }
    }
    // exponent (optional) e or E
    ch = this.peekChar();
    if (ch && (ch === 'e' || ch === 'E')) {
      let look = this.peekChar(1);
      if (/[0-9+\-]/.test(look)) {
        num += this.advance(); // e/E
        if (this.peekChar() === '+' || this.peekChar() === '-') num += this.advance();
        while (/[0-9]/.test(this.peekChar())) num += this.advance();
      }
    }
    return this.makeToken('NUMBER', num, startPos, this.pos, startLine, startCol);
  }

  // Read string literal with escapes
  readString() {
    const quote = this.advance(); // ' or "
    const startPos = this.pos - 1;
    const startLine = this.line;
    const startCol = this.col - 1;
    let str = '';
    while (this.pos < this.len) {
      let ch = this.advance();
      if (ch === '\\') {
        // escape
        const next = this.advance();
        switch (next) {
          case 'n': str += '\n'; break;
          case 'r': str += '\r'; break;
          case 't': str += '\t'; break;
          case 'b': str += '\b'; break;
          case 'f': str += '\f'; break;
          case 'v': str += '\v'; break;
          case '0': str += '\0'; break;
          case 'x': { // hex: \xNN
            const a = this.advance(), b = this.advance();
            str += String.fromCharCode(parseInt(a + b, 16));
            break;
          }
          case 'u': { // unicode \uNNNN
            const hex = this.source.substr(this.pos, 4);
            if (!/^[0-9A-Fa-f]{4}$/.test(hex)) throw new SyntaxError(`Invalid \\u escape at ${this.line}:${this.col}`);
            str += String.fromCharCode(parseInt(hex, 16));
            this.advance(); this.advance(); this.advance(); this.advance();
            break;
          }
          default: str += next;
        }
      } else if (ch === quote) {
        // end string
        return this.makeToken('STRING', str, startPos, this.pos, startLine, startCol);
      } else {
        str += ch;
      }
    }
    throw new SyntaxError(`Unterminated string literal at ${startLine}:${startCol}`);
  }

  // Try to match multi-char operators first (longest match)
  readOperatorOrPunct() {
    const startPos = this.pos;
    const startLine = this.line;
    const startCol = this.col;
    // operators/punctuation list (ordered by length)
    const multi = ["===", "!==", "==", "!=", ">=", "<=", "&&", "||", "=>", "+=", "-=", "*=", "/=", "%="];
    for (const op of multi) {
      if (this.source.startsWith(op, this.pos)) {
        for (let i = 0; i < op.length; i++) this.advance();
        return this.makeToken('OP', op, startPos, this.pos, startLine, startCol);
      }
    }
    // single char operators/punct
    const ch = this.advance();
    if ("+-*/%=<>&|!^?:~".includes(ch)) return this.makeToken('OP', ch, startPos, this.pos, startLine, startCol);
    if ("()[]{};,.".includes(ch)) return this.makeToken('PUNC', ch, startPos, this.pos, startLine, startCol);
    // if we get here, unexpected char
    throw new SyntaxError(`Unexpected character '${ch}' at ${startLine}:${startCol}`);
  }

  // The main method to get the next token
  nextToken() {
    // possibly return comments if includeComments option is set
    const commentToken = this.skipSpacesAndComments();
    if (commentToken) return commentToken;
    if (this.pos >= this.len) return this.makeToken('EOF', null, this.pos, this.pos, this.line, this.col);
    const ch = this.peekChar();
    // identifier or keyword
    if (/[A-Za-z_$]/.test(ch)) return this.readIdentifierOrKeyword();
    // number (starts with digit)
    if (/[0-9]/.test(ch)) return this.readNumber();
    // string
    if (ch === '"' || ch === "'") return this.readString();
    // operators & punctuation
    return this.readOperatorOrPunct();
  }

  // Peek token without consuming (scans, then restores state)
  peek() {
    const save = { pos: this.pos, line: this.line, col: this.col };
    const t = this.nextToken();
    this.pos = save.pos;
    this.line = save.line;
    this.col = save.col;
    return t;
  }

  // Tokenize entire input into an array
  tokenizeAll() {
    const tokens = [];
    while (true) {
      const t = this.nextToken();
      tokens.push(t);
      if (t.type === 'EOF') break;
    }
    return tokens;
  }
}

// Export for Node / testing
if (typeof module !== 'undefined' && module.exports) module.exports = { Scanner };


Example usage & tests

const { Scanner } = require('./scanner');

const src = `
// test program
const x = 3.14 * (y + 2) - func("hello\\n", 42);
/* multi
line comment */
`;

const s = new Scanner(src, { includeComments: true });
const toks = s.tokenizeAll();
console.log(toks);

Expect tokens like KEYWORD(const), IDENT(x), OP(=), NUMBER(3.14), OP(*), PUNC('('), IDENT(y), ..., STRING("hello\n"), plus COMMENT tokens if includeComments is set.


Important points to understand

1. Longest match / greedy matching

When matching operators like == and ===, the scanner must try the longest operators first to avoid splitting them. That's why the multi array is ordered by length. Many lexers use deterministic finite automata or sorted rule lists to ensure the longest match.
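
Here is a minimal standalone sketch of the idea (the helper name and operator list are illustrative, not part of scanner.js): order candidates longest-first and return the first prefix match.

// Sketch: longest-match operator lookup (illustrative helper).
// Candidates must be ordered longest-first, or '===' would match as '=='.
const OPERATORS = ["===", "!==", "==", "!=", "=>", "=", "!", "<", ">"];

function matchOperator(source, pos) {
  for (const op of OPERATORS) {              // longest first
    if (source.startsWith(op, pos)) return op;
  }
  return null;
}

console.log(matchOperator("a === b", 2));    // "===" (not "==")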

2. Why a char-by-char FSM vs regex

Regular expressions can be concise, but they get awkward for nested escapes in strings, complex comment rules, and position tracking. An FSM gives you precise control and simpler error reporting (exact line/column). For learning, an FSM is clearer, and many production lexers are FSM-based.
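
For contrast, here is a hedged sketch of the regex alternative using sticky (/y) patterns, one per rule; it works for simple grammars but gets unwieldy once escapes, comments, and line/column tracking enter the picture. Rule names here are illustrative.

// Sketch: regex-driven tokenizer for comparison (simplified).
// Each rule is tried in order at the current position via the sticky 'y' flag.
const RULES = [
  ['WS',     /\s+/y],
  ['NUMBER', /[0-9]+(?:\.[0-9]+)?/y],
  ['IDENT',  /[A-Za-z_$][A-Za-z0-9_$]*/y],
  ['OP',     /===|!==|==|!=|[+\-*/=<>]/y],
];

function regexTokenize(src) {
  const tokens = [];
  let pos = 0;
  outer: while (pos < src.length) {
    for (const [type, re] of RULES) {
      re.lastIndex = pos;                 // sticky: match exactly at pos
      const m = re.exec(src);
      if (m) {
        if (type !== 'WS') tokens.push({ type, value: m[0] });
        pos = re.lastIndex;
        continue outer;
      }
    }
    throw new SyntaxError(`Unexpected character at ${pos}`);
  }
  return tokens;
}

console.log(regexTokenize('x === 3.14'));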


3. Handling strings & escapes

You must consume the backslash and interpret the escape that follows. Beware of \uXXXX and \xNN: if you don't handle them properly you'll mis-parse the string or produce wrong values.
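
For example, running the scanner above on a line containing escapes shows the decoded values (a quick check, assuming scanner.js is in the same directory):

// Quick check: escapes are decoded into the token's value.
const { Scanner } = require('./scanner');

const toks = new Scanner(String.raw`msg = "Tab:\t A:\u0041 B:\x42"`).tokenizeAll();
const str = toks.find(t => t.type === 'STRING');
console.log(JSON.stringify(str.value)); // "Tab:\t A:A B:B"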

4. Numbers

Basic support: integers, decimals, and exponent notation like 1e-5. Many languages also allow hex/octal/binary literals and underscores as digit separators; add those if you need them.
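
A sketch of how that extension could look, handling 0x/0b prefixes and '_' separators (a hypothetical addition, not part of scanner.js above):

// Sketch: extended number scanning with 0x/0b prefixes and '_' separators
// (hypothetical extension; scanner.js only handles decimal/exponent).
function readExtendedNumber(src, pos) {
  let i = pos;
  if (src[i] === '0' && (src[i + 1] === 'x' || src[i + 1] === 'X')) {
    i += 2;
    while (/[0-9A-Fa-f_]/.test(src[i] || '')) i++;
  } else if (src[i] === '0' && (src[i + 1] === 'b' || src[i + 1] === 'B')) {
    i += 2;
    while (/[01_]/.test(src[i] || '')) i++;
  } else {
    while (/[0-9_]/.test(src[i] || '')) i++;
  }
  // strip separators before converting to a numeric value
  const raw = src.slice(pos, i);
  return { raw, value: Number(raw.replace(/_/g, '')), end: i };
}

console.log(readExtendedNumber('0xFF_EC', 0)); // { raw: '0xFF_EC', value: 65516, end: 7 }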

5. Comments & whitespace

Lexers typically either drop whitespace/comments or emit them as trivia tokens for tools (formatters, linters). The includeComments option lets you choose.
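
For instance, a tool that wants the trivia can scan with includeComments and filter it out later for parsing:

// Keep trivia for tooling, drop it for parsing (uses scanner.js above).
const { Scanner } = require('./scanner');

const all = new Scanner('x = 1 // note', { includeComments: true }).tokenizeAll();
const forParser = all.filter(t => t.type !== 'COMMENT');
console.log(all.length - forParser.length); // 1 comment token held back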

6. Position tracking

Keep line and col updated on every '\n'; this is essential for useful parser errors and source maps.
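
Those line/col fields make error messages like the one below possible (a small illustrative formatting helper):

// Sketch: point at the offending column using a token's line/col.
function formatError(src, message, line, col) {
  const text = src.split('\n')[line - 1] || '';
  return `${message} at ${line}:${col}\n${text}\n${' '.repeat(col - 1)}^`;
}

console.log(formatError('let x = @;', 'Unexpected character', 1, 9));
// Unexpected character at 1:9
// let x = @;
//         ^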

7. Token objects

Include start and end character offsets (useful for editors and tooling), plus line/col for human-readable errors.
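
The start/end offsets let tools recover the exact source text of a token without re-lexing:

// start/end offsets map a token back to its exact source slice.
const { Scanner } = require('./scanner');

const src = 'total = price * 1.07;';
for (const t of new Scanner(src).tokenizeAll()) {
  if (t.type !== 'EOF') console.log(t.type, JSON.stringify(src.slice(t.start, t.end)));
}
// IDENT "total", OP "=", IDENT "price", OP "*", NUMBER "1.07", PUNC ";"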

8. Extensibility

Add token filters (e.g., classify identifiers as keywords).

Add a Scanner option for strict mode (more token types) or different language keyword sets.
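
The keywords option in the constructor already supports the second point; for example, with a made-up mini-language keyword set:

// Swap the keyword set without touching the scanner (supported via options).
const { Scanner } = require('./scanner');

const miniLang = new Scanner('when x then y', {
  keywords: ['when', 'then', 'otherwise'],   // hypothetical mini-language
});
console.log(miniLang.tokenizeAll().map(t => `${t.type}:${t.value}`).join(' '));
// KEYWORD:when IDENT:x KEYWORD:then IDENT:y EOF:null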


Advanced topics & research pointers

Performance: production compilers use state machines compiled into tight loops. If performance matters, measure (benchmark) with realistic inputs. Avoid constructing many short strings; reuse buffers if necessary. (See lexer implementations such as Oxc and other production JS parser libraries.)
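
One common micro-optimization (a sketch; measure before adopting it) is replacing per-character regex tests with charCodeAt range checks:

// Sketch: charCodeAt range checks avoid regex overhead in the hot loop.
function isIdentChar(code) {
  return (code >= 97 && code <= 122) ||  // a-z
         (code >= 65 && code <= 90)  ||  // A-Z
         (code >= 48 && code <= 57)  ||  // 0-9
         code === 95 || code === 36;     // _ $
}

// Usage inside a scan loop: test char codes, slice the lexeme once at the end.
const src = 'counter42 ';
let i = 0;
while (i < src.length && isIdentChar(src.charCodeAt(i))) i++;
console.log(src.slice(0, i)); // "counter42"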



Lexer generators: Tools similar to flex produce lexers from regex rules and actions. There are JS libraries inspired by flex. Use these for large grammars to reduce boilerplate. 
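
Here is a hedged sketch using moo (https://github.com/no-context/moo); the rule names and toy grammar are illustrative, and moo's API details may vary across versions:

// Sketch: a moo-based lexer (npm install moo). Rules are compiled into one
// combined regex; moo.keywords refines ident matches into keywords.
const moo = require('moo');

const lexer = moo.compile({
  ws:     { match: /[ \t\n]+/, lineBreaks: true },
  number: /[0-9]+(?:\.[0-9]+)?/,
  ident:  { match: /[A-Za-z_$][A-Za-z0-9_$]*/,
            type: moo.keywords({ keyword: ['if', 'else', 'while'] }) },
  op:     ['===', '!==', '==', '!=', '=', '+', '-', '*', '/'],
  punc:   ['(', ')', '{', '}', ';'],
});

lexer.reset('if (x === 42) { x = x + 1; }');
for (const tok of lexer) {
  if (tok.type !== 'ws') console.log(tok.type, tok.value);
}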

Streaming / incremental lexing: For editors you may need incremental lexing (lex only the changed region and reuse previous tokens). That requires careful state representation.

Unicode & encodings: lexers must handle Unicode identifiers and surrogate pairs correctly. This example assumes ASCII-like identifiers; adapt it to Unicode character classes for production.
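
JavaScript's Unicode property escapes (ES2018+) make the identifier classes straightforward, and spreading a string walks code points, so surrogate pairs stay intact (a sketch):

// Sketch: Unicode-aware identifier checks via property escapes (ES2018+).
const ID_START = /[\p{ID_Start}$_]/u;
const ID_CONTINUE = /[\p{ID_Continue}$]/u;

function isUnicodeIdent(word) {
  const chars = [...word];               // code points, not UTF-16 units
  return chars.length > 0 &&
         ID_START.test(chars[0]) &&
         chars.slice(1).every(c => ID_CONTINUE.test(c));
}

console.log(isUnicodeIdent('变量'));  // true
console.log(isUnicodeIdent('λ1'));    // true
console.log(isUnicodeIdent('1abc'));  // false (digits can't start an identifier)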


Common pitfalls & debugging tips

Unterminated string/comment: always report exact line/column. Our code throws SyntaxError with position.

Mixing up token rule order: if you test single-char operators before multi-char ones, you'll split == into = =. Order matters.

Position off by one: track start positions before advancing.

Ambiguous tokens: 123abc can be a number followed by an identifier, or a single invalid token, depending on the language; define these rules early.


How to integrate with a parser

Parser repeatedly calls scanner.nextToken() or peek() and uses token type and value to apply grammar rules.

For recursive-descent parsers, peek() helps decide which production to follow without consuming.
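
Here is a minimal recursive-descent sketch over the Scanner above for arithmetic expressions; the grammar and AST shape are illustrative, not a fixed design.

// Sketch: recursive-descent expression parser driven by peek()/nextToken().
const { Scanner } = require('./scanner');

function parse(src) {
  const sc = new Scanner(src);

  function expect(type, value) {
    const t = sc.nextToken();
    if (t.type !== type || (value !== undefined && t.value !== value)) {
      throw new SyntaxError(`Expected ${value || type} at ${t.line}:${t.col}`);
    }
    return t;
  }

  // expr := term (('+' | '-') term)*
  function expr() {
    let left = term();
    let t = sc.peek();
    while (t.type === 'OP' && (t.value === '+' || t.value === '-')) {
      sc.nextToken();                       // consume the operator
      left = { op: t.value, left, right: term() };
      t = sc.peek();
    }
    return left;
  }

  // term := factor (('*' | '/') factor)*
  function term() {
    let left = factor();
    let t = sc.peek();
    while (t.type === 'OP' && (t.value === '*' || t.value === '/')) {
      sc.nextToken();
      left = { op: t.value, left, right: factor() };
      t = sc.peek();
    }
    return left;
  }

  // factor := NUMBER | '(' expr ')'
  function factor() {
    const t = sc.peek();                    // decide the production without consuming
    if (t.type === 'NUMBER') { sc.nextToken(); return { num: Number(t.value) }; }
    expect('PUNC', '(');
    const inner = expr();
    expect('PUNC', ')');
    return inner;
  }

  const tree = expr();
  expect('EOF');
  return tree;
}

console.log(JSON.stringify(parse('1 + 2 * (3 - 4)')));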


Research & examples

Lexer descriptions and responsibilities (lexer/tokenizer/scanner).

Tokenizer tutorials and math-evaluator tokenizers.

Notes on design patterns (FSM vs regex) and community discussion.


If you're building a language, add:

Full Unicode identifier handling (use \p{ID_Start} via Unicode regex or codepoint checks).

Hex/Oct/Bin numeric literals and numeric separators (underscores).

Template literals (JS-style backticks), which require nested lexing.

Token positions suitable for source maps (byte offsets if you need UTF-8 correctness).

Tests: write unit tests for edge cases (unterminated strings, invalid escapes, huge inputs); see the sketch after this list.
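
A few example assertions using Node's built-in assert module, as a starting point rather than a full suite:

// Edge-case tests for scanner.js using Node's assert module.
const assert = require('assert');
const { Scanner } = require('./scanner');

// Unterminated string must throw a SyntaxError with position info.
assert.throws(() => new Scanner('"abc').tokenizeAll(), SyntaxError);

// Unterminated block comment must also throw.
assert.throws(() => new Scanner('/* never closed').tokenizeAll(), SyntaxError);

// '===' must stay one token, never '==' plus '='.
const ops = new Scanner('a === b').tokenizeAll().filter(t => t.type === 'OP');
assert.strictEqual(ops[0].value, '===');

// Exponent notation is a single NUMBER token.
assert.strictEqual(new Scanner('1e-5').tokenizeAll()[0].value, '1e-5');

console.log('scanner edge-case tests passed');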

If you want a lexer-generator approach in JS, a flex-style library such as moo (a popular JS lexer) lets you map rules to actions; see the moo sketch in the lexer-generators section above.

If you’re building an editor plugin, consider incremental lexing techniques and returning minimal edit diffs.

