diff --git a/benchmarks/performance/words.bench.ts b/benchmarks/performance/words.bench.ts new file mode 100644 index 000000000..820dc7c88 --- /dev/null +++ b/benchmarks/performance/words.bench.ts @@ -0,0 +1,15 @@ +import { bench, describe } from 'vitest'; +import { words as wordsToolkit } from 'es-toolkit'; +import { words as wordLodash_ } from 'lodash'; + +describe('Performance Comparison: es-toolkit words vs lodash words', () => { + const testString = 'This is a test string with different_cases and UPPERCASE words 🚀 and more symbols'; + + bench('es-toolkit words', () => { + wordsToolkit(testString); + }); + + bench('lodash words', () => { + wordLodash_(testString); + }); +}); diff --git a/docs/ja/reference/string/words.md b/docs/ja/reference/string/words.md new file mode 100644 index 000000000..ab1d55e78 --- /dev/null +++ b/docs/ja/reference/string/words.md @@ -0,0 +1,44 @@ +# words + +文字列を単語単位で分割し、配列として返します。ASCIIおよびUnicode文字をすべて単語として認識できます。 + +## インターフェース + +```ts +function words(str: string): string[]; +``` + +### パラメータ + +- `str` (`string`): 単語に分割する文字列です。 + +### 戻り値 + +(`string[]`): 文字列を単語単位で分割した配列です。 + +## 例 + +```typescript +words('fred, barney, & pebbles'); +// => ['fred', 'barney', 'pebbles'] + +words('camelCaseHTTPRequest🚀'); +// => ['camel', 'Case', 'HTTP', 'Request', '🚀'] + +words('Lunedì 18 Set'); +// => ['Lunedì', '18', 'Set'] +``` + +## Lodash 互換性 + +`es-toolkit/compat` から `words` をインポートすると、Lodash と互換になります。 + +- `words`では、文字列を分割する正規表現を変更するために、第二引数`pattern`を提供できます。 +- `words`は、第一引数が文字列でない場合、自動的に文字列に変換します。 + +```typescript +import { words } from 'es-toolkit/compat'; + +words('fred, barney, & pebbles', /[^, ]+/g); +// 戻り値: ['fred', 'barney', '&', 'pebbles'] +``` diff --git a/docs/ko/reference/string/words.md b/docs/ko/reference/string/words.md new file mode 100644 index 000000000..e3cb72793 --- /dev/null +++ b/docs/ko/reference/string/words.md @@ -0,0 +1,44 @@ +# words + +문자열을 단어 단위로 분리해 배열로 반환해요. ASCII 및 유니코드 문자를 모두 단어로 인식할 수 있어요. 
+ +## 인터페이스 + +```ts +function words(str: string): string[]; +``` + +### 파라미터 + +- `str` (`string`): 단어로 분리할 문자열. + +### 반환 값 + +(`string[]`): 문자열을 단어 단위로 분리한 배열. + +## 예시 + +```typescript +words('fred, barney, & pebbles'); +// => ['fred', 'barney', 'pebbles'] + +words('camelCaseHTTPRequest🚀'); +// => ['camel', 'Case', 'HTTP', 'Request', '🚀'] + +words('Lunedì 18 Set'); +// => ['Lunedì', '18', 'Set'] +``` + +## Lodash 호환성 + +`es-toolkit/compat`에서 `words`를 가져오면 lodash와 호환돼요. + +- `words`에서 문자열을 분리하는 정규식을 바꾸기 위해서 두 번째 인자 `pattern`을 제공할 수 있어요. +- `words`는 첫 번째 인자가 문자열이 아닌 경우, 자동으로 문자열로 바꿔요. + +```typescript +import { words } from 'es-toolkit/compat'; + +words('fred, barney, & pebbles', /[^, ]+/g); +// 반환 값: ['fred', 'barney', '&', 'pebbles'] +``` diff --git a/docs/reference/string/words.md b/docs/reference/string/words.md new file mode 100644 index 000000000..4578ff27f --- /dev/null +++ b/docs/reference/string/words.md @@ -0,0 +1,44 @@ +# words + +Splits a string into an array of words. It can recognize both ASCII and Unicode characters as words. + +## Signature + +```ts +function words(str: string): string[]; +``` + +### Parameters + +- `str` (`string`): The string to split into words. + +### Returns + +(`string[]`): An array of words extracted from the string. + +## Examples + +```typescript +words('fred, barney, & pebbles'); +// => ['fred', 'barney', 'pebbles'] + +words('camelCaseHTTPRequest🚀'); +// => ['camel', 'Case', 'HTTP', 'Request', '🚀'] + +words('Lunedì 18 Set'); +// => ['Lunedì', '18', 'Set'] +``` + +## Lodash Compatibility + +To ensure full compatibility with lodash, you can import `words` from `es-toolkit/compat`. + +- `words` also takes an optional second parameter, `pattern`, which allows you to define custom patterns for splitting the string. +- `words` will automatically convert the first argument to a string if it isn't one already. 
+ +```typescript +import { words } from 'es-toolkit/compat'; + +words('fred, barney, & pebbles', /[^, ]+/g); +// Returns ['fred', 'barney', '&', 'pebbles'] +``` diff --git a/docs/zh_hans/reference/string/words.md b/docs/zh_hans/reference/string/words.md new file mode 100644 index 000000000..d7b3612ca --- /dev/null +++ b/docs/zh_hans/reference/string/words.md @@ -0,0 +1,44 @@ +# words + +将字符串拆分为单词数组。它可以识别 ASCII 和 Unicode 字符作为单词。 + +## 签名 + +```ts +function words(str: string): string[]; +``` + +### 参数 + +- `str` (`string`): 要拆分为单词的字符串。 + +### 返回值 + +(`string[]`): 从字符串中提取的单词数组。 + +## 示例 + +```typescript +words('fred, barney, & pebbles'); +// => ['fred', 'barney', 'pebbles'] + +words('camelCaseHTTPRequest🚀'); +// => ['camel', 'Case', 'HTTP', 'Request', '🚀'] + +words('Lunedì 18 Set'); +// => ['Lunedì', '18', 'Set'] +``` + +## Lodash 兼容性 + +从 `es-toolkit/compat` 中导入 `words` 以实现与 lodash 的完全兼容。 + +- `words` 还接受一个可选的第二个参数 `pattern`,允许您定义自定义模式来拆分字符串。 +- 如果第一个参数不是字符串,`words` 将自动将其转换为字符串。 + +```typescript +import { words } from 'es-toolkit/compat'; + +words('fred, barney, & pebbles', /[^, ]+/g); +// ['fred', 'barney', '&', 'pebbles'] +``` diff --git a/src/compat/array/differenceBy.spec.ts b/src/compat/array/differenceBy.spec.ts index d5b6e638b..65107ca66 100644 --- a/src/compat/array/differenceBy.spec.ts +++ b/src/compat/array/differenceBy.spec.ts @@ -14,12 +14,13 @@ describe('differenceBy', () => { expect(actual).toEqual([{ x: 2 }]); }); - it('should provide correct `iteratee` arguments', () => { + it('should provide correct iteratee arguments', () => { let args: any; - differenceBy([2.1, 1.2], [2.3, 3.4], function () { - // eslint-disable-next-line - args || (args = slice.call(arguments)); + differenceBy([2.1, 1.2], [2.3, 3.4], function (...rest: any[]) { + if (!args) { + args = slice.call(rest); + } }); expect(args).toEqual([2.3]); diff --git a/src/compat/string/startCase.ts b/src/compat/string/startCase.ts index 25810d1e3..8b41733a2 100644 --- 
a/src/compat/string/startCase.ts +++ b/src/compat/string/startCase.ts @@ -1,4 +1,4 @@ -import { getWords } from '../../string/_internal/getWords.ts'; +import { words as getWords } from '../../string/words.ts'; import { normalizeForCase } from '../_internal/normalizeForCase.ts'; /** diff --git a/src/compat/string/words.spec.ts b/src/compat/string/words.spec.ts new file mode 100644 index 000000000..385137dcc --- /dev/null +++ b/src/compat/string/words.spec.ts @@ -0,0 +1,44 @@ +import { describe, expect, it } from 'vitest'; +import { words } from './words'; + +describe('words', () => { + it('splits a simple ASCII comma-separated string into words', () => { + const result = words('fred, barney, & pebbles'); + expect(result).toEqual(['fred', 'barney', 'pebbles']); + }); + + it('splits a string with custom pattern', () => { + const result = words('fred, barney, & pebbles', /[^, ]+/g); + expect(result).toEqual(['fred', 'barney', '&', 'pebbles']); + }); + + it('returns an empty array when input is an empty string', () => { + const result = words(''); + expect(result).toEqual([]); + }); + + it('correctly handles a string with multiple number inputs', () => { + const result = words('+0 -3 +3 -4 +4'); + expect(result).toEqual(['0', '3', '3', '4', '4']); + }); + + it('splits a space-separated string into individual words', () => { + const result = words('split these words'); + expect(result).toEqual(['split', 'these', 'words']); + }); + + it('splits a string representation of an array', () => { + const result = words([1, 2, 3]); + expect(result).toEqual(['1', '2', '3']); + }); + + it('returns an empty array when input is undefined', () => { + const result = words(undefined); + expect(result).toEqual([]); + }); + + it('correctly handles a string with Unicode emojis and special characters', () => { + const result = words('example🚀with✨emojis💡and🔍special🌟characters'); + expect(result).toEqual(['example', '🚀', 'with', '✨', 'emojis', '💡', 'and', '🔍', 'special', '🌟', 'characters']); 
+ }); +}); diff --git a/src/compat/string/words.ts b/src/compat/string/words.ts new file mode 100644 index 000000000..71fc3d13c --- /dev/null +++ b/src/compat/string/words.ts @@ -0,0 +1,22 @@ +import { CASE_SPLIT_PATTERN } from '../../string/words.ts'; +import { toString } from '../util/toString.ts'; + +/** + * Splits `string` into an array of its words. + * + * @param {string | object} str - The string or object that is to be split into words. + * @param {RegExp | string} [pattern] - The pattern to match words. + * @returns {string[]} - Returns the words of `string`. + * + * @example + * const wordsArray1 = words('fred, barney, & pebbles'); + * // => ['fred', 'barney', 'pebbles'] + * + */ +export function words(str?: string | object, pattern: RegExp | string = CASE_SPLIT_PATTERN): string[] { + const input = toString(str); + + const words = Array.from(input.match(pattern) ?? []); + + return words.filter(x => x !== ''); +} diff --git a/src/string/camelCase.ts b/src/string/camelCase.ts index 268a9a686..43a5db911 100644 --- a/src/string/camelCase.ts +++ b/src/string/camelCase.ts @@ -1,5 +1,5 @@ -import { getWords } from './_internal/getWords.ts'; import { capitalize } from './capitalize.ts'; +import { words as getWords } from './words.ts'; /** * Converts a string to camel case. diff --git a/src/string/constantCase.ts b/src/string/constantCase.ts index b399da1ac..f43dc452e 100644 --- a/src/string/constantCase.ts +++ b/src/string/constantCase.ts @@ -1,4 +1,4 @@ -import { getWords } from './_internal/getWords.ts'; +import { words as getWords } from './words.ts'; /** * Converts a string to constant case. 
diff --git a/src/string/index.ts b/src/string/index.ts index 861db5866..b54d143f8 100644 --- a/src/string/index.ts +++ b/src/string/index.ts @@ -17,3 +17,4 @@ export { trimStart } from './trimStart.ts'; export { unescape } from './unescape.ts'; export { upperCase } from './upperCase.ts'; export { upperFirst } from './upperFirst.ts'; +export { words } from './words.ts'; diff --git a/src/string/kebabCase.ts b/src/string/kebabCase.ts index e7fca3512..194f3fb2e 100644 --- a/src/string/kebabCase.ts +++ b/src/string/kebabCase.ts @@ -1,4 +1,4 @@ -import { getWords } from './_internal/getWords.ts'; +import { words as getWords } from './words.ts'; /** * Converts a string to kebab case. diff --git a/src/string/lowerCase.ts b/src/string/lowerCase.ts index 320310a01..0d0a1541d 100644 --- a/src/string/lowerCase.ts +++ b/src/string/lowerCase.ts @@ -1,4 +1,4 @@ -import { getWords } from './_internal/getWords.ts'; +import { words as getWords } from './words.ts'; /** * Converts a string to lower case. diff --git a/src/string/pascalCase.ts b/src/string/pascalCase.ts index 00c5cb405..014402f1f 100644 --- a/src/string/pascalCase.ts +++ b/src/string/pascalCase.ts @@ -1,5 +1,5 @@ -import { getWords } from './_internal/getWords.ts'; import { capitalize } from './capitalize.ts'; +import { words as getWords } from './words.ts'; /** * Converts a string to Pascal case. diff --git a/src/string/snakeCase.ts b/src/string/snakeCase.ts index e2e4bf3cb..bf86ededb 100644 --- a/src/string/snakeCase.ts +++ b/src/string/snakeCase.ts @@ -1,4 +1,4 @@ -import { getWords } from './_internal/getWords.ts'; +import { words as getWords } from './words.ts'; /** * Converts a string to snake case. 
diff --git a/src/string/startCase.ts b/src/string/startCase.ts index 66879ce18..54d28655f 100644 --- a/src/string/startCase.ts +++ b/src/string/startCase.ts @@ -1,4 +1,4 @@ -import { getWords } from './_internal/getWords.ts'; +import { words as getWords } from './words.ts'; /** * Converts the first character of each word in a string to uppercase and the remaining characters to lowercase. diff --git a/src/string/upperCase.ts b/src/string/upperCase.ts index 2192cf121..465848bf3 100644 --- a/src/string/upperCase.ts +++ b/src/string/upperCase.ts @@ -1,4 +1,4 @@ -import { getWords } from './_internal/getWords.ts'; +import { words as getWords } from './words.ts'; /** * Converts a string to upper case. diff --git a/src/string/_internal/getWords.spec.ts b/src/string/words.spec.ts similarity index 51% rename from src/string/_internal/getWords.spec.ts rename to src/string/words.spec.ts index 66265daca..50338ad00 100644 --- a/src/string/_internal/getWords.spec.ts +++ b/src/string/words.spec.ts @@ -1,74 +1,94 @@ import { describe, expect, it } from 'vitest'; -import { getWords } from './getWords'; +import { words } from './words'; + +describe('words', () => { + it('splits a simple ASCII comma-separated string into words', () => { + const result = words('fred, barney, & pebbles'); + expect(result).toEqual(['fred', 'barney', 'pebbles']); + }); + + it('returns an empty array when input is an empty string', () => { + const result = words(''); + expect(result).toEqual([]); + }); + + it('splits a space-separated string into individual words', () => { + const result = words('split these words'); + expect(result).toEqual(['split', 'these', 'words']); + }); + + it('splits Unicode emojis and special characters as separate words', () => { + const result = words('example🚀with✨emojis💡and🔍special🌟characters'); + expect(result).toEqual(['example', '🚀', 'with', '✨', 'emojis', '💡', 'and', '🔍', 'special', '🌟', 'characters']); + }); -describe('caseSplitPattern', () => { it('should match 
camelCase', () => { const str = 'camelCase'; - const matches = getWords(str); + const matches = words(str); expect(matches).toEqual(['camel', 'Case']); }); it('should match snake_case', () => { const str = 'snake_case'; - const matches = getWords(str); + const matches = words(str); expect(matches).toEqual(['snake', 'case']); }); it('should match kebab-case', () => { const str = 'kebab-case'; - const matches = getWords(str); + const matches = words(str); expect(matches).toEqual(['kebab', 'case']); }); it('should handle mixed formats', () => { const str = 'camelCase_snake_case-kebabCase'; - const matches = getWords(str); + const matches = words(str); expect(matches).toEqual(['camel', 'Case', 'snake', 'case', 'kebab', 'Case']); }); it('should match acronyms', () => { const str = 'HTTPRequest'; - const matches = getWords(str); + const matches = words(str); expect(matches).toEqual(['HTTP', 'Request']); }); it('should match special characters', () => { const str = 'special_characters@123'; - const matches = getWords(str); + const matches = words(str); expect(matches).toEqual(['special', 'characters', '123']); }); it('should handle leading and trailing whitespace', () => { const str = ' leading_and_trailing_whitespace '; - const matches = getWords(str); + const matches = words(str); expect(matches).toEqual(['leading', 'and', 'trailing', 'whitespace']); }); it('should handle underscores', () => { const str = 'underscore_case_example'; - const matches = getWords(str); + const matches = words(str); expect(matches).toEqual(['underscore', 'case', 'example']); }); it('should handle single character words', () => { const str = 'aB'; - const matches = getWords(str); + const matches = words(str); expect(matches).toEqual(['a', 'B']); }); it('should work with hyphens', () => { - expect(getWords('--FOO-BAR--')).toEqual(['FOO', 'BAR']); + expect(words('--FOO-BAR--')).toEqual(['FOO', 'BAR']); }); it('should work with numbers', () => { - expect(getWords('foo2bar')).toEqual(['foo', '2', 
'bar']); + expect(words('foo2bar')).toEqual(['foo', '2', 'bar']); }); it('should match emojis', () => { - expect(getWords('camelCaseHTTPRequest🚀')).toEqual(['camel', 'Case', 'HTTP', 'Request', '🚀']); + expect(words('camelCaseHTTPRequest🚀')).toEqual(['camel', 'Case', 'HTTP', 'Request', '🚀']); }); it('should match accented letters', () => { - expect(getWords('Lunedì 18 Set')).toEqual(['Lunedì', '18', 'Set']); + expect(words('Lunedì 18 Set')).toEqual(['Lunedì', '18', 'Set']); }); }); diff --git a/src/string/_internal/getWords.ts b/src/string/words.ts similarity index 62% rename from src/string/_internal/getWords.ts rename to src/string/words.ts index dc12ec787..e4328b6e3 100644 --- a/src/string/_internal/getWords.ts +++ b/src/string/words.ts @@ -15,9 +15,26 @@ * const matches = 'camelCaseHTTPRequest🚀'.match(CASE_SPLIT_PATTERN); * // matches: ['camel', 'Case', 'HTTP', 'Request', '🚀'] */ -const CASE_SPLIT_PATTERN = +export const CASE_SPLIT_PATTERN = /\p{Lu}?\p{Ll}+|[0-9]+|\p{Lu}+(?!\p{Ll})|\p{Emoji_Presentation}|\p{Extended_Pictographic}|\p{L}+/gu; -export function getWords(str: string): string[] { +/** + * Splits `string` into an array of its words, treating spaces and punctuation marks as separators. + * + * @param {string} str The string to inspect. + * It is always matched against `CASE_SPLIT_PATTERN`; a custom `pattern` argument is only accepted by `words` from `es-toolkit/compat`. + * @returns {string[]} Returns the words of `string`. + * + * @example + * words('fred, barney, & pebbles'); + * // => ['fred', 'barney', 'pebbles'] + * + * words('camelCaseHTTPRequest🚀'); + * // => ['camel', 'Case', 'HTTP', 'Request', '🚀'] + * + * words('Lunedì 18 Set'); + * // => ['Lunedì', '18', 'Set'] + */ +export function words(str: string): string[] { return Array.from(str.match(CASE_SPLIT_PATTERN) ?? []); }