chore: improve delimiter (#8552)

langgenius · Sep 19, 2024 · 7411bcf · 7411bcf
1 parent d96f5ba
commit 7411bcf
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 12 deletions.
diff --git a/web/app/components/datasets/create/step-two/escape.ts b/web/app/components/datasets/create/step-two/escape.ts
@@ -0,0 +1,18 @@
+function escape(input: string): string {
+  if (!input || typeof input !== 'string')
+    return ''
+
+  const res = input
+    .replaceAll('\\', '\\\\')
+    .replaceAll('\0', '\\0')
+    .replaceAll('\b', '\\b')
+    .replaceAll('\f', '\\f')
+    .replaceAll('\n', '\\n')
+    .replaceAll('\r', '\\r')
+    .replaceAll('\t', '\\t')
+    .replaceAll('\v', '\\v')
+    .replaceAll('\'', '\\\'')
+  return res
+}
+
+export default escape
diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx
@@ -1,5 +1,5 @@
 'use client'
-import React, { useEffect, useLayoutEffect, useRef, useState } from 'react'
+import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import { useContext } from 'use-context-selector'
 import { useBoolean } from 'ahooks'
@@ -13,6 +13,8 @@ import { groupBy } from 'lodash-es'
 import PreviewItem, { PreviewType } from './preview-item'
 import LanguageSelect from './language-select'
 import s from './index.module.css'
+import unescape from './unescape'
+import escape from './escape'
 import cn from '@/utils/classnames'
 import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
 import {
@@ -78,6 +80,8 @@ enum IndexingType {
   ECONOMICAL = 'economy',
 }
 
+const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
+
 const StepTwo = ({
   isSetting,
   documentDetail,
@@ -110,8 +114,11 @@ const StepTwo = ({
   const previewScrollRef = useRef<HTMLDivElement>(null)
   const [previewScrolled, setPreviewScrolled] = useState(false)
   const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
-  const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
-  const [max, setMax] = useState(5000) // default chunk length
+  const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
+  const setSegmentIdentifier = useCallback((value: string) => {
+    doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
+  }, [])
+  const [max, setMax] = useState(4000) // default chunk length
   const [overlap, setOverlap] = useState(50)
   const [rules, setRules] = useState<PreProcessingRule[]>([])
   const [defaultConfig, setDefaultConfig] = useState<Rules>()
@@ -183,7 +190,7 @@ const StepTwo = ({
   }
   const resetRules = () => {
     if (defaultConfig) {
-      setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n')
+      setSegmentIdentifier(defaultConfig.segmentation.separator)
       setMax(defaultConfig.segmentation.max_tokens)
       setOverlap(defaultConfig.segmentation.chunk_overlap)
       setRules(defaultConfig.pre_processing_rules)
@@ -217,7 +224,7 @@ const StepTwo = ({
       const ruleObj = {
         pre_processing_rules: rules,
         segmentation: {
-          separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier,
+          separator: unescape(segmentIdentifier),
           max_tokens: max,
           chunk_overlap: overlap,
         },
@@ -394,7 +401,7 @@ const StepTwo = ({
     try {
       const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
       const separator = res.rules.segmentation.separator
-      setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
+      setSegmentIdentifier(separator)
       setMax(res.rules.segmentation.max_tokens)
       setOverlap(res.rules.segmentation.chunk_overlap)
       setRules(res.rules.pre_processing_rules)
@@ -411,7 +418,7 @@ const StepTwo = ({
       const separator = rules.segmentation.separator
       const max = rules.segmentation.max_tokens
       const overlap = rules.segmentation.chunk_overlap
-      setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n')
+      setSegmentIdentifier(separator)
       setMax(max)
       setOverlap(overlap)
       setRules(rules.pre_processing_rules)
@@ -616,12 +623,22 @@ const StepTwo = ({
                 <div className={s.typeFormBody}>
                   <div className={s.formRow}>
                     <div className='w-full'>
-                      <div className={s.label}>{t('datasetCreation.stepTwo.separator')}</div>
+                      <div className={s.label}>
+                        {t('datasetCreation.stepTwo.separator')}
+                        <Tooltip
+                          popupContent={
+                            <div className='max-w-[200px]'>
+                              {t('datasetCreation.stepTwo.separatorTip')}
+                            </div>
+                          }
+                        />
+                      </div>
                       <input
                         type="text"
                         className={s.input}
-                        placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} value={segmentIdentifier}
-                        onChange={e => setSegmentIdentifier(e.target.value)}
+                        placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
+                        value={segmentIdentifier}
+                        onChange={e => doSetSegmentIdentifier(e.target.value)}
                       />
                     </div>
                   </div>

diff --git a/web/app/components/datasets/create/step-two/unescape.ts b/web/app/components/datasets/create/step-two/unescape.ts
@@ -0,0 +1,54 @@
+// https://github.com/iamakulov/unescape-js/blob/master/src/index.js
+
+/**
+ * \\ - matches the backslash which indicates the beginning of an escape sequence
+ * (
+ *   u\{([0-9A-Fa-f]+)\} - first alternative; matches the variable-length hexadecimal escape sequence (\u{ABCD0})
+ * |
+ *   u([0-9A-Fa-f]{4}) - second alternative; matches the 4-digit hexadecimal escape sequence (\uABCD)
+ * |
+ *   x([0-9A-Fa-f]{2}) - third alternative; matches the 2-digit hexadecimal escape sequence (\xA5)
+ * |
+ *   ([1-7][0-7]{0,2}|[0-7]{2,3}) - fourth alternative; matches the up-to-3-digit octal escape sequence (\5 or \512)
+ * |
+ *   (['"tbrnfv0\\]) - fifth alternative; matches the special escape characters (\t, \n and so on)
+ * |
+ *   \U([0-9A-Fa-f]+) - sixth alternative; matches the 8-digit hexadecimal escape sequence used by python (\U0001F3B5)
+ * )
+ */
+const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\\U([0-9A-Fa-f]{8})/g
+
+const usualEscapeSequences: Record<string, string> = {
+  '0': '\0',
+  'b': '\b',
+  'f': '\f',
+  'n': '\n',
+  'r': '\r',
+  't': '\t',
+  'v': '\v',
+  '\'': '\'',
+  '"': '"',
+  '\\': '\\',
+}
+
+const fromHex = (str: string) => String.fromCodePoint(parseInt(str, 16))
+const fromOct = (str: string) => String.fromCodePoint(parseInt(str, 8))
+
+const unescape = (str: string) => {
+  return str.replace(jsEscapeRegex, (_, __, varHex, longHex, shortHex, octal, specialCharacter, python) => {
+    if (varHex !== undefined)
+      return fromHex(varHex)
+    else if (longHex !== undefined)
+      return fromHex(longHex)
+    else if (shortHex !== undefined)
+      return fromHex(shortHex)
+    else if (octal !== undefined)
+      return fromOct(octal)
+    else if (python !== undefined)
+      return fromHex(python)
+    else
+      return usualEscapeSequences[specialCharacter]
+  })
+}
+
+export default unescape
diff --git a/web/i18n/en-US/dataset-creation.ts b/web/i18n/en-US/dataset-creation.ts
@@ -87,7 +87,8 @@ const translation = {
     custom: 'Custom',
     customDescription: 'Customize chunks rules, chunks length, and preprocessing rules, etc.',
     separator: 'Delimiter',
-    separatorPlaceholder: 'For example, newline (\\\\n) or special separator (such as "***")',
+    separatorTip: 'A delimiter is the character used to separate text. \\n\\n and \\n are commonly used delimiters for separating paragraphs and lines. Combined with commas (\\n\\n,\\n), paragraphs will be segmented by lines when exceeding the maximum chunk length. You can also use special delimiters defined by yourself (e.g. ***).',
+    separatorPlaceholder: '\\n\\n for separating paragraphs; \\n for separating lines',
     maxLength: 'Maximum chunk length',
     overlap: 'Chunk overlap',
     overlapTip: 'Setting the chunk overlap can maintain the semantic relevance between them, enhancing the retrieve effect. It is recommended to set 10%-25% of the maximum chunk size.',

diff --git a/web/i18n/zh-Hans/dataset-creation.ts b/web/i18n/zh-Hans/dataset-creation.ts
@@ -87,7 +87,8 @@ const translation = {
     custom: '自定义',
     customDescription: '自定义分段规则、分段长度以及预处理规则等参数',
     separator: '分段标识符',
-    separatorPlaceholder: '例如换行符（\n）或特定的分隔符（如 "***"）',
+    separatorTip: '分隔符是用于分隔文本的字符。\\n\\n 和 \\n 是常用于分隔段落和行的分隔符。用逗号连接分隔符（\\n\\n,\\n），当段落超过最大块长度时，会按行进行分割。你也可以使用自定义的特殊分隔符（例如 ***）。',
+    separatorPlaceholder: '\\n\\n 用于分段；\\n 用于分行',
     maxLength: '分段最大长度',
     overlap: '分段重叠长度',
     overlapTip: '设置分段之间的重叠长度可以保留分段之间的语义关系，提升召回效果。建议设置为最大分段长度的10%-25%',