Skip to content

Commit

Permalink
feat(helper): add getPageText (#39)
Browse files Browse the repository at this point in the history
  • Loading branch information
Mister-Hope authored Jan 30, 2024
1 parent f99d448 commit d666d1b
Show file tree
Hide file tree
Showing 7 changed files with 818 additions and 18 deletions.
1 change: 1 addition & 0 deletions tools/helper/src/node/page/index.ts
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
export * from './excerpt.js'
export * from './text.js'
159 changes: 159 additions & 0 deletions tools/helper/src/node/page/text.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// eslint-disable-next-line vue/prefer-import-from-vue
import { isHTMLTag } from '@vue/shared'
import type { AnyNode } from 'cheerio'
import { load } from 'cheerio'
import type { App, Page } from 'vuepress/core'
import {} from 'vuepress/shared'
import { isArray } from '../../shared/index.js'

/**
 * Media tags that are represented by their `alt` attribute
 * when extracting text.
 */
const MEDIA_WITH_ALT = ['img']

/**
 * Tags that never contribute to the extracted text.
 *
 * These are always skipped by `handleNode`, in addition to the tags
 * supplied through the `removedTags` option.
 */
const REMOVED_TAGS = [
  // non content
  'title',
  'base',
  'meta',
  'template',
  'script',
  'style',
  'slot',

  // not main content
  'nav',
  'aside',
  'footer',

  // deleted
  'del',
  's',

  // rich media
  'audio',
  'video',
  'canvas',
  'iframe',
  'map',
  'area',
  'track',
  'object',

  // input
  'input',
  'textarea',
  'select',
  'option',
  'optgroup',
  'datalist',
]

/**
 * Options accepted by `getPageText`.
 */
export interface PageTextOptions {
  /**
   * Whether to convert the text into single line content
   *
   * @description When enabled, every run of whitespace (including line breaks)
   * is collapsed into a single space.
   *
   * @default false
   */
  singleLine?: boolean

  /**
   * Length of text
   *
   * @description Text length will be the minimal possible length reaching this value
   *
   * @default 300
   */
  length?: number

  /**
   * Tags to be removed
   *
   * @description Table and code blocks are removed by default.
   *
   * @default ['table', 'pre']
   */
  removedTags?: string[]
}

/**
 * Options threaded through `handleNode` / `handleNodes` while walking the tree.
 */
interface NodeOptions {
  /** `base` option of the app, as read by `getPageText` */
  base: string
  /** Tag names whose content is skipped, on top of `REMOVED_TAGS` */
  removedTags: string[]
}

/** `id` values and class names that mark a table of contents */
const TOC_MARKERS = ['table-of-contents', 'toc']

/**
 * Extract the plain text carried by a single parsed node.
 *
 * - text nodes return their data
 * - elements marked as table of contents (by `id` or by any of their classes) are dropped
 * - media listed in `MEDIA_WITH_ALT` return their `alt` text
 * - other elements return the text of their children, unless the tag is listed in
 *   `REMOVED_TAGS` / `removedTags` or is not a known HTML tag
 * - any other node type returns an empty string
 *
 * @param node - Node to extract text from
 * @param options - Node options, forwarded unchanged to child nodes
 * @returns Text of the node, or an empty string when the node is dropped
 */
const handleNode = (
  node: AnyNode,
  { base, removedTags }: NodeOptions,
): string => {
  if (node.type === 'tag') {
    const { id = '', class: className = '' } = node.attribs

    // toc should be dropped
    // `class` is a whitespace-separated token list, so each token is checked
    // instead of comparing the whole attribute value
    if (
      TOC_MARKERS.includes(id) ||
      className.split(/\s+/).some((token) => TOC_MARKERS.includes(token))
    )
      return ''

    // return alt text
    if (MEDIA_WITH_ALT.includes(node.tagName)) return node.attribs.alt ?? ''

    // only known html tags that are not removed contribute their children's text
    if (
      !REMOVED_TAGS.includes(node.tagName) &&
      !removedTags.includes(node.tagName) &&
      isHTMLTag(node.tagName)
    )
      return handleNodes(node.children, { base, removedTags })

    return ''
  }

  if (node.type === 'text') return node.data

  return ''
}

/**
 * Concatenate the text of a list of nodes, in document order.
 *
 * @param nodes - Nodes to extract text from, may be `null`
 * @param options - Node options, forwarded to `handleNode` for every node
 * @returns Joined text of all nodes, or an empty string when `nodes` is not an array
 */
const handleNodes = (
  nodes: AnyNode[] | null,
  { base, removedTags }: NodeOptions,
): string => {
  if (!isArray(nodes)) return ''

  let content = ''

  for (const child of nodes) content += handleNode(child, { base, removedTags })

  return content
}

// cheerio instance loaded with empty content, used below only for `parseHTML`
const $ = load('')

/**
 * Get the plain text of a page from its rendered content.
 *
 * Root nodes of the rendered HTML are converted to text one by one and
 * appended to the result; extraction stops after the first root node that
 * brings the accumulated text to at least `length` characters, so the result
 * is not cut in the middle of a root node.
 *
 * @param app - VuePress app, only `options.base` is read
 * @param page - Page, only `contentRendered` is read
 * @param options - Text options, see `PageTextOptions`
 * @returns Extracted text; with `singleLine`, whitespace runs are collapsed
 * after the length check, so the returned text may be shorter than `length`
 */
export const getPageText = (
  { options: { base } }: App,
  { contentRendered }: Page,
  {
    length = 300,
    singleLine,
    removedTags = ['table', 'pre'],
  }: PageTextOptions = {},
): string => {
  let result = ''
  const rootNodes = $.parseHTML(contentRendered) ?? []

  for (const node of rootNodes) {
    const text = handleNode(node, { base, removedTags })

    if (text) {
      result += text
      // compare the accumulated text, not just the current node's text,
      // otherwise pages made of short paragraphs never stop early
      if (result.length >= length) break
    }
  }

  // `\s` already matches line breaks, one pass collapses everything
  return singleLine ? result.replace(/\s+/g, ' ') : result
}
2 changes: 1 addition & 1 deletion tools/helper/tests/__fixtures__/src/markdown.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ Classic:
| center | right | left |
| :------------------------: | -----------------------: | :---------------------- |
| For center align use `:-:` | For right align use `-:` | For left align use `:-` |
| b | aaaaaaaaa | aaaa |
| table text | aaaaaaaaa | aaaa |
| c | aaaa | a |

## Codes
Expand Down
32 changes: 16 additions & 16 deletions tools/helper/tests/node/__snapshots__/excerpt.spec.ts.snap
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html

exports[`generate page excerpt correctly > default > generate excerpt for all pages > / 1`] = `
exports[`getPageExcerpt > default > generate excerpt for all pages > / 1`] = `
"<p>Homepage Content</p>
"
`;

exports[`generate page excerpt correctly > default > generate excerpt for all pages > /component.html 1`] = `
exports[`getPageExcerpt > default > generate excerpt for all pages > /component.html 1`] = `
"<p>A text with </p>
"
`;

exports[`generate page excerpt correctly > default > generate excerpt for all pages > /custom-separator.html 1`] = `
exports[`getPageExcerpt > default > generate excerpt for all pages > /custom-separator.html 1`] = `
"<p>Here is <strong>article excerpt</strong>.</p>
<div class="language-javascript" data-ext="js" data-title="js"><pre class="language-javascript"><code>const a = 1;
</code></pre></div><p>END_OF_EXCERPT</p>
Expand All @@ -23,12 +23,12 @@ exports[`generate page excerpt correctly > default > generate excerpt for all pa
</ol>"
`;

exports[`generate page excerpt correctly > default > generate excerpt for all pages > /long-content.html 1`] = `
exports[`getPageExcerpt > default > generate excerpt for all pages > /long-content.html 1`] = `
"<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nemo, rem. Recusandae itaque rem, non perspiciatis magnam molestiae, in ab quae, dolore illo neque possimus nisi inventore? Laudantium officia nihil iusto!</p>
<p>Quod delectus vero dicta perferendis quia sint incidunt vitae pariatur, nemo suscipit saepe veritatis laudantium deleniti aperiam! Explicabo dignissimos, corporis, odio voluptatum eius aut pariatur dicta inventore, amet optio modi!</p>"
`;

exports[`generate page excerpt correctly > default > generate excerpt for all pages > /markdown.html 1`] = `
exports[`getPageExcerpt > default > generate excerpt for all pages > /markdown.html 1`] = `
"
<h2>Headings</h2>
<h1>Heading 1</h1>
Expand All @@ -42,23 +42,23 @@ exports[`generate page excerpt correctly > default > generate excerpt for all pa
<p>This sentence has <strong>bold</strong>、<em>italic</em> and <s>delete</s> style text.</p>"
`;

exports[`generate page excerpt correctly > default > generate excerpt for all pages > /separator.html 1`] = `
exports[`getPageExcerpt > default > generate excerpt for all pages > /separator.html 1`] = `
"<p>Here is <strong>article excerpt</strong>.</p>
<div class="language-javascript" data-ext="js" data-title="js"><pre class="language-javascript"><code>const a = 1;
</code></pre></div>"
`;

exports[`generate page excerpt correctly > excerptLength > extract all content with Infinity > / 1`] = `
exports[`getPageExcerpt > excerptLength > extract all content with Infinity > / 1`] = `
"<p>Homepage Content</p>
"
`;

exports[`generate page excerpt correctly > excerptLength > extract all content with Infinity > /component.html 1`] = `
exports[`getPageExcerpt > excerptLength > extract all content with Infinity > /component.html 1`] = `
"<p>A text with </p>
"
`;

exports[`generate page excerpt correctly > excerptLength > extract all content with Infinity > /custom-separator.html 1`] = `
exports[`getPageExcerpt > excerptLength > extract all content with Infinity > /custom-separator.html 1`] = `
"<p>Here is <strong>article excerpt</strong>.</p>
<div class="language-javascript" data-ext="js" data-title="js"><pre class="language-javascript"><code>const a = 1;
</code></pre></div><p>END_OF_EXCERPT</p>
Expand All @@ -73,7 +73,7 @@ exports[`generate page excerpt correctly > excerptLength > extract all content w
</code></pre></div>"
`;

exports[`generate page excerpt correctly > excerptLength > extract all content with Infinity > /long-content.html 1`] = `
exports[`getPageExcerpt > excerptLength > extract all content with Infinity > /long-content.html 1`] = `
"<p>Lorem ipsum dolor sit amet consectetur adipisicing elit. Nemo, rem. Recusandae itaque rem, non perspiciatis magnam molestiae, in ab quae, dolore illo neque possimus nisi inventore? Laudantium officia nihil iusto!</p>
<p>Quod delectus vero dicta perferendis quia sint incidunt vitae pariatur, nemo suscipit saepe veritatis laudantium deleniti aperiam! Explicabo dignissimos, corporis, odio voluptatum eius aut pariatur dicta inventore, amet optio modi!</p>
<p>Eligendi voluptatum animi aspernatur rerum hic quasi neque doloribus mollitia quaerat. Id blanditiis, nostrum autem sequi vero praesentium magnam totam in sit animi velit, veritatis hic natus? Quidem, distinctio provident!</p>
Expand Down Expand Up @@ -128,7 +128,7 @@ exports[`generate page excerpt correctly > excerptLength > extract all content w
"
`;

exports[`generate page excerpt correctly > excerptLength > extract all content with Infinity > /markdown.html 1`] = `
exports[`getPageExcerpt > excerptLength > extract all content with Infinity > /markdown.html 1`] = `
"
<h2>Headings</h2>
<h1>Heading 1</h1>
Expand Down Expand Up @@ -231,7 +231,7 @@ line break again</li>
<td style="text-align:left">For left align use <code>:-</code></td>
</tr>
<tr>
<td style="text-align:center">b</td>
<td style="text-align:center">table text</td>
<td style="text-align:right">aaaaaaaaa</td>
<td style="text-align:left">aaaa</td>
</tr>
Expand All @@ -257,25 +257,25 @@ console.log(foo(5));
"
`;
exports[`generate page excerpt correctly > excerptLength > extract all content with Infinity > /separator.html 1`] = `
exports[`getPageExcerpt > excerptLength > extract all content with Infinity > /separator.html 1`] = `
"<p>Here is <strong>article excerpt</strong>.</p>
<div class="language-javascript" data-ext="js" data-title="js"><pre class="language-javascript"><code>const a = 1;
</code></pre></div>"
`;
exports[`generate page excerpt correctly > excerptLength > only generate when having marker with 0 > /separator.html 1`] = `
exports[`getPageExcerpt > excerptLength > only generate when having marker with 0 > /separator.html 1`] = `
"<p>Here is <strong>article excerpt</strong>.</p>
<div class="language-javascript" data-ext="js" data-title="js"><pre class="language-javascript"><code>const a = 1;
</code></pre></div>"
`;
exports[`generate page excerpt correctly > excerptSeparator > generate excerpt with custom marker > /custom-separator.html 1`] = `
exports[`getPageExcerpt > excerptSeparator > generate excerpt with custom marker > /custom-separator.html 1`] = `
"<p>Here is <strong>article excerpt</strong>.</p>
<div class="language-javascript" data-ext="js" data-title="js"><pre class="language-javascript"><code>const a = 1;
</code></pre></div>"
`;
exports[`generate page excerpt correctly > excerptSeparator > generate excerpt with default marker > /separator.html 1`] = `
exports[`getPageExcerpt > excerptSeparator > generate excerpt with default marker > /separator.html 1`] = `
"<p>Here is <strong>article excerpt</strong>.</p>
<div class="language-javascript" data-ext="js" data-title="js"><pre class="language-javascript"><code>const a = 1;
</code></pre></div>"
Expand Down
Loading

0 comments on commit d666d1b

Please sign in to comment.