Whatever message this page gives is out now! Go check it out!
<cfscript>
// Compare how many segments the same documents yield at two chunk sizes.
docService = documentService();
docsPath = expandPath("./docs/");
documents = docService.load({ path: docsPath });

// Split the identical input twice, varying only chunkSize.
smallSegments = docService.split(documents, { chunkSize: 200 });
largeSegments = docService.split(documents, { chunkSize: 1000 });

// Prints "<small count> vs <large count>".
smallCount = arrayLen(smallSegments);
largeCount = arrayLen(largeSegments);
writeOutput(smallCount & " vs " & largeCount);
</cfscript>
Parameter | Role |
chunkSize | Target maximum size of a chunk (tests use character-oriented sizes, for example 500 or 1000). |
chunkOverlap | Characters shared between consecutive chunks when the splitter supports overlap (for example 50, 100, or 0). |
splitterType | Algorithm that determines how chunk boundaries are chosen (recursive, sentence, character, line, paragraph, word, regex, hierarchical, semantic, and in some APIs custom). |
recursive | Used in Simple RAG options alongside splitterType (example: recursive: false with character in tests). |
regexPattern | Required for regex splitters: pattern used to split (example: newline pattern in tests). |
separators | For recursive, optional ordered list of delimiter strings (example: double newline then single newline). |
Splitter type | Splits on | Best for |
character | Fixed character count | Simple text; fast but can cut text mid-word or mid-sentence. |
word | Word boundaries | Better than character for readability; no semantic awareness. |
sentence | Sentence boundaries | Prose documents; preserves complete sentences. |
line | Line breaks | Log files, CSV-like data, code. |
paragraph | Paragraph breaks | Documents where each paragraph is a self-contained unit. |
recursive | Paragraphs → sentences → words → characters (cascade) | General purpose; the recommended default for most documents. |
hierarchical | Document structure (headings, sections) | Structured documents like manuals, wikis, legal docs. |
semantic | Semantic similarity between sentences | Dense technical content where meaning shifts within paragraphs. |
regex | Custom regular expression pattern | Custom-formatted documents. |
<cfscript>
// Split the same documents with each listed splitter type and print the
// resulting segment count, one line per type, for side-by-side comparison.
docService = documentService();
documents = docService.load({ path: expandPath("./docs/") });

// "regex" is part of the list so that the regexPattern branch below is
// actually exercised; without it that branch can never run.
types = ["recursive", "character", "line", "word", "sentence", "paragraph", "regex"];

for (splitterType in types) {
    // Same size and overlap for every type so only the algorithm varies.
    opts = { chunkSize: 500, chunkOverlap: 50, splitterType: splitterType };

    if (splitterType == "regex") {
        // Regex splitters need a pattern; this one matches a period
        // followed by one or more whitespace characters.
        opts.regexPattern = "\.\s+";
    }

    segments = docService.split(documents, opts);
    writeOutput(splitterType & ": " & arrayLen(segments) & " segments<br>");
}
</cfscript>
Content type | Recommended splitter |
Most prose documents (articles, FAQs, manuals) | recursive — the recommended general-purpose default. Cascades from paragraphs down to characters. |
Narrative text, customer support transcripts | sentence — preserves complete sentences; avoids mid-sentence cuts. |
Log files, CSV-like data, code listings | line — splits on line breaks; each line is a discrete unit. |
Structured documents (manuals, wikis, legal docs) | hierarchical — respects document structure such as headings and sections. |
Dense technical content where meaning shifts within paragraphs | semantic — groups sentences by semantic similarity rather than syntactic boundaries. |
Custom-formatted documents with known delimiters | regex — provide regexPattern to define split points precisely. |
Simple text; performance-sensitive pipelines | character — fastest; does not preserve word or sentence boundaries. |
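Domain-specific structure (legal clauses, code functions) | Custom splitter UDF — a user-defined function that receives a document struct and returns an array of segment structs (text plus metadata). |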
<cfscript>
// Split a single text file with chunkSize 200 and no overlap between
// consecutive chunks, then report how many segments were produced.
docService = documentService();
longDocPath = expandPath("./docs/long-doc.txt");
documents = docService.load({ path: longDocPath });

splitOptions = {
    chunkSize: 200,
    chunkOverlap: 0
};
segments = docService.split(documents, splitOptions);

writeOutput("segment count: " & arrayLen(segments));
</cfscript><cfscript>
// End-to-end RAG example: build a chat model and an in-memory vector store,
// ingest PDFs from ./docs/, then ask a question against the ingested content.
chatModel = ChatModel({
    provider: "openai",
    modelName: "gpt-4o-mini",
    apiKey: application.apiKey,
    temperature: 0.7
});

docsDir = expandPath("./docs/");

embeddingModel = {
    provider: "openai",
    modelName: "text-embedding-3-small",
    apiKey: application.apiKey
};

vectorStore = VectorStore({
    provider: "INMEMORY",
    embeddingModel: embeddingModel
});

// Custom splitter UDF: produces one segment per non-empty paragraph.
// Reads document.text and document.metadata; returns an array of
// { text, metadata } structs, each carrying the source document's metadata.
customSplitter = function(required struct document) {
    // The 4th argument (multiCharacterDelimiter = true) makes listToArray treat
    // the two-newline string as ONE delimiter. Without it, each character of the
    // delimiter string is a separate delimiter, so the text would be split on
    // every single newline instead of on blank lines between paragraphs.
    var paragraphs = listToArray(document.text, chr(10) & chr(10), false, true);
    var segments = [];

    for (var para in paragraphs) {
        var cleaned = trim(para);
        // Skip fragments that are empty once surrounding whitespace is removed.
        if (len(cleaned) > 0) {
            arrayAppend(segments, {
                text: cleaned,
                metadata: document.metadata
            });
        }
    }

    return segments;
};

// NOTE(review): customSplitter is defined above but is not referenced by the
// agent configuration below, which uses the built-in "recursive" splitter.
// Confirm how a custom splitter UDF is meant to be passed to the agent.
ragBot = agent({
    CHATMODEL: chatModel,
    ingestion: {
        source: docsDir,
        recursive: false,
        includePatterns: ["*.pdf"],
        documentSplitter: {
            splitterType: "recursive",
            chunkSize: 500,
            chunkOverlap: 50,
            // Ordered delimiters: blank line first, then single newline.
            separators: [ chr(10) & chr(10), chr(10) ]
        },
        vectorStoreIngestor: { vectorStore: vectorStore }
    },
    retrievalAugmentor: {
        queryRouter: {
            contentRetrievers: [{
                vectorStore: vectorStore,
                maxResults: 3,
                minScore: 0.3,
                description: "Knowledge base"
            }]
        }
    }
});

// Ingest first so the vector store is populated before the question is asked.
ragBot.ingest();
answer = ragBot.chat("How to extend Adobe subscription using prepaid card?");
writeOutput(answer.message);
</cfscript>
Metadata key | Description |
splitterType | The splitter algorithm used to produce this segment. |
splitTimestamp | Time the split operation was recorded. |
documentIndex | Zero-based index of the source document in the input array. |
chunkIndex | Zero-based position of this segment within its source document. |
globalSegmentIndex | Zero-based position of this segment across all documents in the split call. |
totalChunks | arrayLen(segments) — total segments produced in this split call. |
chunkSize | The chunkSize value used for this split. |
chunkOverlap | The chunkOverlap value used for this split. |
startOffset | Character offset in the source document where this segment begins. |
endOffset | Character offset in the source document where this segment ends. Always greater than startOffset. |
<cfscript>
// Inspect the metadata struct attached to the first segment of a split.
docService = documentService();
docsPath = expandPath("./docs/");
documents = docService.load({ path: docsPath });
segments = docService.split(documents, { chunkSize: 300 });

// CFML arrays are 1-based, so index 1 is the first segment.
firstSegment = segments[1];
meta = firstSegment.metadata;
writeDump(meta);
</cfscript><cfscript>
// Load files matching *.pdf and split them without passing any options,
// then print the number of segments produced.
docService = documentService();
loadOptions = {
    path: expandPath("./docs/"),
    pattern: "*.pdf"
};
documents = docService.load(loadOptions);

// No options struct is supplied to split() here.
segments = docService.split(documents);

segmentCount = arrayLen(segments);
writeOutput(segmentCount & " segments");
</cfscript><cfscript>
// Load files matching *.pdf, split them with an explicit chunk size and
// overlap, and dump the first resulting segment for inspection.
docService = documentService();
pdfSource = {
    path: expandPath("./docs/"),
    pattern: "*.pdf"
};
documents = docService.load(pdfSource);

splitOptions = { chunkSize: 500, chunkOverlap: 50 };
segments = docService.split(documents, splitOptions);

// CFML arrays are 1-based, so index 1 is the first segment.
firstSegment = segments[1];
writeDump(firstSegment);
</cfscript><cfscript>
// Load files matching *.pdf and split them with the "recursive" splitter
// named explicitly, then print the number of segments produced.
docService = documentService();
pdfSource = {
    path: expandPath("./docs/"),
    pattern: "*.pdf"
};
documents = docService.load(pdfSource);

splitOptions = {
    splitterType: "recursive",
    chunkSize: 500,
    chunkOverlap: 50
};
segments = docService.split(documents, splitOptions);

segmentCount = arrayLen(segments);
writeOutput(segmentCount & " segments");
</cfscript><cfscript>
// Load files matching *.pdf, split them with chunkSize 1000 and
// chunkOverlap 100, and print both the document and segment counts.
docService = documentService();
pdfSource = {
    path: expandPath("./docs/"),
    pattern: "*.pdf"
};
documents = docService.load(pdfSource);

splitOptions = { chunkSize: 1000, chunkOverlap: 100 };
segments = docService.split(documents, splitOptions);

documentCount = arrayLen(documents);
segmentCount = arrayLen(segments);
writeOutput("documents=" & documentCount & " segments=" & segmentCount);
</cfscript><cfscript>
// Agent ingestion of a single text file with an empty documentSplitter
// struct (no splitter options set explicitly); prints result.segmentsIngested.
docsDir = expandPath("./docs/");

chatModel = ChatModel({
    provider: "openai",
    modelName: "gpt-4o-mini",
    apiKey: application.apiKey,
    temperature: 0.7
});

embeddingConfig = {
    provider: "openai",
    modelName: "text-embedding-3-small",
    apiKey: application.apiKey
};
vectorStore = VectorStore({
    provider: "INMEMORY",
    embeddingModel: embeddingConfig
});

// The same vector store is used for both ingestion and retrieval.
knowledgeBaseRetriever = {
    vectorStore: vectorStore,
    maxResults: 5,
    minScore: 0.3,
    description: "Knowledge base"
};

ragService = agent({
    CHATMODEL: chatModel,
    ingestion: {
        source: docsDir & "long-doc.txt",
        documentSplitter: {},
        vectorStoreIngestor: { vectorStore: vectorStore }
    },
    retrievalAugmentor: {
        queryRouter: {
            contentRetrievers: [ knowledgeBaseRetriever ]
        }
    }
});

result = ragService.ingest();
writeOutput(result.segmentsIngested);
</cfscript><cfscript>
// Agent that ingests the ./docs/ folder with chunkSize 500 / chunkOverlap 50,
// then answers a question using the ingested content.
docsDir = expandPath("./docs/");

chatModel = ChatModel({
    provider: "openai",
    modelName: "gpt-4o-mini",
    apiKey: application.apiKey,
    temperature: 0.7
});

embeddingConfig = {
    provider: "openai",
    modelName: "text-embedding-3-small",
    apiKey: application.apiKey
};
vectorStore = VectorStore({
    provider: "INMEMORY",
    embeddingModel: embeddingConfig
});

splitterConfig = {
    chunkSize: 500,
    chunkOverlap: 50
};

// The same vector store is used for both ingestion and retrieval.
knowledgeBaseRetriever = {
    vectorStore: vectorStore,
    maxResults: 5,
    minScore: 0.3,
    description: "Knowledge base"
};

ragService = agent({
    CHATMODEL: chatModel,
    ingestion: {
        source: docsDir,
        documentSplitter: splitterConfig,
        vectorStoreIngestor: { vectorStore: vectorStore }
    },
    retrievalAugmentor: {
        queryRouter: {
            contentRetrievers: [ knowledgeBaseRetriever ]
        }
    }
});

// Ingest before chatting so the store is populated.
ragService.ingest();
answer = ragService.chat("How to update TIN?");
writeOutput(answer.message);
</cfscript><cfscript>
// Agent configured with the sentence splitter (chunkSize 500, chunkOverlap 50).
// Relies on chatModel, vectorStore and docsDir already being defined, as in
// the preceding example on this page.
ragService = agent({
    CHATMODEL: chatModel,
    ingestion: {
        source: docsDir,
        documentSplitter: {
            splitterType: "sentence",
            chunkSize: 500,
            chunkOverlap: 50
        },
        vectorStoreIngestor: { vectorStore: vectorStore }
    },
    // Retrieval settings mirror the other agent examples on this page.
    retrievalAugmentor: {
        queryRouter: {
            contentRetrievers: [{
                vectorStore: vectorStore,
                maxResults: 5,
                minScore: 0.3,
                description: "Knowledge base"
            }]
        }
    }
});
</cfscript><cfscript>
// Agent that ingests a single text file using the regex splitter, splitting
// on newlines, then answers a question using the ingested content.
chatModel = ChatModel({
    provider: "openai",
    modelName: "gpt-4o-mini",
    apiKey: application.apiKey,
    temperature: 0.7
});

vectorStore = VectorStore({
    provider: "INMEMORY",
    embeddingModel: {
        provider: "openai",
        modelName: "text-embedding-3-small",
        apiKey: application.apiKey
    }
});

// sample.txt is read from the docs/ folder resolved by expandPath("./docs/").
docsDir = expandPath("./docs/");

ragService = agent({
    CHATMODEL: chatModel,
    ingestion: {
        source: docsDir & "sample.txt",
        documentSplitter: {
            splitterType: "regex",
            // CFML string literals do not process backslash escapes, so "\n"
            // reaches the regex engine as backslash + n, i.e. the newline
            // pattern. A doubled backslash would instead match a literal
            // backslash followed by the letter n.
            regexPattern: "\n"
        },
        vectorStoreIngestor: { vectorStore: vectorStore }
    },
    retrievalAugmentor: {
        queryRouter: {
            contentRetrievers: [{
                vectorStore: vectorStore,
                maxResults: 5,
                minScore: 0.3,
                description: "Knowledge base"
            }]
        }
    }
});

// Ingest before chatting so the store is populated.
ragService.ingest();
answer = ragService.chat("What is the plot of study in scarlet?");
writeOutput(answer.message);
</cfscript><cfscript>
// Agent that ingests a single text file using the recursive splitter with an
// explicit ordered separator list, then answers a question.

// sample.txt is read from the docs/ folder resolved by expandPath("./docs/").
docsDir = expandPath("./docs/");

chatModel = ChatModel({
    provider: "openai",
    modelName: "gpt-4o-mini",
    apiKey: application.apiKey,
    temperature: 0.7
});

embeddingConfig = {
    provider: "openai",
    modelName: "text-embedding-3-small",
    apiKey: application.apiKey
};
vectorStore = VectorStore({
    provider: "INMEMORY",
    embeddingModel: embeddingConfig
});

// Ordered delimiters: blank line (two newlines) first, then single newline.
newline = chr(10);
splitterConfig = {
    splitterType: "recursive",
    separators: [ newline & newline, newline ]
};

// The same vector store is used for both ingestion and retrieval.
knowledgeBaseRetriever = {
    vectorStore: vectorStore,
    maxResults: 5,
    minScore: 0.3,
    description: "Knowledge base"
};

ragService = agent({
    CHATMODEL: chatModel,
    ingestion: {
        source: docsDir & "sample.txt",
        documentSplitter: splitterConfig,
        vectorStoreIngestor: { vectorStore: vectorStore }
    },
    retrievalAugmentor: {
        queryRouter: {
            contentRetrievers: [ knowledgeBaseRetriever ]
        }
    }
});

// Ingest before chatting so the store is populated.
ragService.ingest();
answer = ragService.chat("Who is Dr. Watson?");
writeOutput(answer.message);
</cfscript><cfscript>
// Simple RAG over the ./docs/ folder using the character splitter
// (chunkSize 500, chunkOverlap 50), followed by a single question.
docsDir = expandPath("./docs/");

chatModel = ChatModel({
    provider: "openai",
    modelName: "gpt-4o-mini",
    apiKey: application.apiKey,
    temperature: 0.7
});

embeddingConfig = {
    provider: "openai",
    modelName: "text-embedding-3-small",
    apiKey: application.apiKey
};
vectorStore = VectorStore({
    provider: "INMEMORY",
    embeddingModel: embeddingConfig
});

// Options struct passed as the third argument to simpleRAG().
ragOptions = {
    vectorStore: vectorStore,
    chunkSize: 500,
    chunkOverlap: 50,
    splitterType: "character",
    recursive: false
};

ragService = simpleRAG(docsDir, chatModel, ragOptions);

// Ingest before asking so the store is populated.
ragService.ingest();
answer = ragService.ask("How to extend Adobe subscription?");
writeOutput(answer.message);
</cfscript><cfscript>
// Simple RAG over the ./docs/ folder using the sentence splitter
// (chunkSize 500, chunkOverlap 200), followed by a single question.
docsDir = expandPath("./docs/");

chatModel = ChatModel({
    provider: "openai",
    modelName: "gpt-4o-mini",
    apiKey: application.apiKey,
    temperature: 0.7
});

embeddingConfig = {
    provider: "openai",
    modelName: "text-embedding-3-small",
    apiKey: application.apiKey
};
vectorStore = VectorStore({
    provider: "INMEMORY",
    embeddingModel: embeddingConfig
});

// Options struct passed as the third argument to simpleRAG().
ragOptions = {
    vectorStore: vectorStore,
    chunkSize: 500,
    chunkOverlap: 200,
    splitterType: "sentence",
    recursive: false
};

ragService = simpleRAG(docsDir, chatModel, ragOptions);

// Ingest before asking so the store is populated.
ragService.ingest();
answer = ragService.ask("How to respond to Adobe support ticket?");
writeOutput(answer.message);
</cfscript>