Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Talisman KB integration via selector id hints #131

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "web-scraper-chrome-extension",
"version": "0.4.8",
"version": "0.4.9",
"description": "Web data extraction tool implemented as chrome extension",
"scripts": {
"lint": "eslint --ext .js src",
Expand Down
10 changes: 9 additions & 1 deletion src/_locales/en/messages.json
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
"selector_edit_type_of_html_outer": { "message": "Outer HTML" },
"selector_edit_type_of_html_inner": { "message": "Inner HTML" },
"selector_edit_merge_into_list": { "message": "Merge collected items into list" },
"selector_edit_dont_flatten": { "message": "Don't flatten results" },
"sitemap_scrape_config_request_interval_label": { "message": "Request interval (ms)" },
"sitemap_scrape_config_request_interval_randomness_label": {
"message": "Request interval randomness (ms)"
Expand Down Expand Up @@ -343,5 +344,12 @@
},
"popup_ws_version": {
"message": "Web Scraper version: "
}
},

"link_types_for_concept_type": { "message": "Link types for concept type" },
"concept_types_for_link_type": { "message": "Concept types for link type" },
"prop_types_for_concept_type": { "message": "Property types for concept type" },
"prop_types_for_link_type": { "message": "Property types for link type" },
"all_concept_types": { "message": "All concept types" },
"incompatible_kb_parent_type": { "message": "There are incompatible parent KB types" }
}
10 changes: 10 additions & 0 deletions src/_locales/ru/messages.json
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
"selector_edit_type_of_html_outer": { "message": "Внешний HTML" },
"selector_edit_type_of_html_inner": { "message": "Внутренний HTML" },
"selector_edit_merge_into_list": { "message": "Объединить собранные элементы в список" },
"selector_edit_dont_flatten": { "message": "Не приводить данные к плоскому виду" },

"sitemap_scrape_config_requestInterval": { "message": "Интервал между запросами" },
"sitemap_scrape_config_requestIntervalRandomness": { "message": "Случайность между запросами" },
Expand Down Expand Up @@ -379,5 +380,14 @@
},
"popup_ws_version": {
"message": "Версия Web Scraper: "
},

"link_types_for_concept_type": { "message": "Типы связей для типа концепта" },
"concept_types_for_link_type": { "message": "Типы концептов для типа связи" },
"prop_types_for_concept_type": { "message": "Типы характеристик для типа концепта" },
"prop_types_for_link_type": { "message": "Типы характеристик для типа связи" },
"all_concept_types": { "message": "Все типы концептов" },
"incompatible_kb_parent_type": {
"message": "Среди селекторов-предков есть несовместимые типы БЗ"
}
}
12 changes: 12 additions & 0 deletions src/background/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,18 @@ browser.runtime.onMessage.addListener(async request => {
return store.getSitemapData(Sitemap.sitemapFromObj(request.sitemap));
}

if (request.listAllConceptTypes) {
return store.listAllConceptTypes();
}

if (request.getConceptType) {
return store.getConceptType(request.id);
}

if (request.getLinkType) {
return store.getLinkType(request.id);
}

if (request.scrapeSitemap) {
const sitemap = Sitemap.sitemapFromObj(request.sitemap);
const queue = new Queue();
Expand Down
18 changes: 18 additions & 0 deletions src/devtools/views/SelectorEdit.html
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
</select>
</div>
</div>

<div class="form-group">
<input type="hidden" name="uuid" value="{{selector.uuid}}" disabled />
</div>
Expand Down Expand Up @@ -562,6 +563,23 @@
</div>
</div>

<div class="form-group feature feature-dontFlatten">
<label
for="dontFlatten"
class="col-sm-2 control-label"
data-i18n="selector_edit_dont_flatten"
></label>

<div class="col-sm-8">
<div class="checkbox">
<label>
<input type="checkbox" id="dontFlatten" name="dontFlatten"
{{#selector.dontFlatten}}checked="checked" {{/selector.dontFlatten}}/>
</label>
</div>
</div>
</div>

<div class="form-group feature feature-delay">
<label for="delay" class="col-sm-2 control-label" data-i18n="selector_edit_delay"></label>

Expand Down
89 changes: 68 additions & 21 deletions src/scripts/Controller.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import SelectorList from './SelectorList';
import SelectorTable from './Selector/SelectorTable';
import Model from './Model';
import Translator from './Translator';
import TalismanKB from './TalismanKB';

export default class SitemapController {
constructor(store, templateDir) {
Expand Down Expand Up @@ -89,6 +90,11 @@ export default class SitemapController {
return value;
})
.set_sort_objects(true);

if (store.storageType === 'StoreTalismanApi') {
this.kb = new TalismanKB(store);
}

return this.init();
}

Expand All @@ -101,10 +107,11 @@ export default class SitemapController {
event,
selector,
(function (selector, event) {
return function () {
return function (...args) {
const continueBubbling = controls[selector][event].call(
controller,
this
this,
...args
);
if (continueBubbling !== true) {
return false;
Expand Down Expand Up @@ -272,6 +279,7 @@ export default class SitemapController {
},
'#edit-selector #selectorId': {
'change:flexdatalist': this.updateCurrentlyEditedSelectorInParentsList,
'select:flexdatalist': this.selectorIdHintSelected,
},
'#selector-tree button[action=add-selector]': {
click: this.addSelector,
Expand Down Expand Up @@ -1193,6 +1201,10 @@ export default class SitemapController {
}
}

if (this.kb) {
return this.kb.validateParentSelectors(newSelector, sitemap);
}

return true;
}.bind(this),
},
Expand All @@ -1202,9 +1214,9 @@ export default class SitemapController {
});
}

editSelector(button) {
async editSelector(button) {
const selector = $(button).closest('tr').data('selector');
this._editSelector(selector);
await this._editSelector(selector);
}

updateCurrentlyEditedSelectorInParentsList() {
Expand All @@ -1215,7 +1227,7 @@ export default class SitemapController {
$('.currently-edited').val(selector.uuid).text(`${selectorId} - ${selector.uuid}`);
}

_editSelector(selector) {
async _editSelector(selector) {
const sitemap = this.state.currentSitemap;
const selectorIds = sitemap.getPossibleParentSelectorIds();

Expand All @@ -1227,19 +1239,6 @@ export default class SitemapController {
});
$('#viewport').html($editSelectorForm);

$('#selectorId').flexdatalist({
init: this.initSelectorValidation(),
textProperty: '{fieldName}',
valueProperty: 'fieldName',
data: [...sitemap.model, { entity: '', field: '', fieldName: selector.id }],
searchIn: ['entity', 'field'],
visibleProperties: ['entity', 'field'],
groupBy: 'entity',
searchContain: true,
noResultsText: '',
minLength: 1,
});

// mark initially opened selector as currently edited
$('#edit-selector #parentSelectors option').each((_, element) => {
if ($(element).val() === selector.uuid) {
Expand Down Expand Up @@ -1267,12 +1266,14 @@ export default class SitemapController {
.attr('selected', 'selected');
});

this.initSelectorValidation();

this.state.currentSelector = selector;
this.selectorTypeChanged(false);
await this.selectorTypeChanged(false);
Translator.translatePage();
}

selectorTypeChanged(changeTrigger) {
async selectorTypeChanged(changeTrigger) {
// add this selector to possible parent selector
const selector = this.getCurrentlyEditedSelector();
const features = selector.getFeatures();
Expand All @@ -1296,6 +1297,48 @@ export default class SitemapController {
else {
$('#edit-selector #parentSelectors .currently-edited').remove();
}

await this.initSelectorIdHints(selector);
}

async initSelectorIdHints(selector) {
const $selectorIdInput = $('#selectorId');
$selectorIdInput.flexdatalist();
const idDatalistOptions = {
textProperty: '{fieldName}',
valueProperty: 'fieldName',
searchIn: ['entity', 'field'],
visibleProperties: ['entity', 'field'],
groupBy: 'entity',
searchContain: true,
selectionRequired: false,
noResultsText: '',
minLength: 1,
};

const hints = [];
if (this.kb) {
const kbHints = await this.kb.generateIdHints(selector, this.state.currentSitemap);
hints.push(...kbHints);
} else {
hints.push(...this.state.currentSitemap.model);
}
hints.push({
entity: '',
field: '',
fieldName: selector.id,
});

$selectorIdInput.flexdatalist({
...idDatalistOptions,
data: hints,
});
}

selectorIdHintSelected(selectorIdInput, event, item) {
if (item.kbHint) {
$('#edit-selector [name=dontFlatten]').prop('checked', true);
}
}

async saveSelector(button) {
Expand Down Expand Up @@ -1353,6 +1396,7 @@ export default class SitemapController {
const delay = $('#edit-selector [name=delay]').val();
const outerHTML = $('#edit-selector [id=outerHTML]').is(':checked');
const mergeIntoList = $('#edit-selector [name=mergeIntoList]').is(':checked');
const dontFlatten = $('#edit-selector [name=dontFlatten]').is(':checked');
const extractAttribute = $('#edit-selector [name=extractAttribute]').val();
const extractStyle = $('#edit-selector [name=extractStyle]').val();
const value = $('#edit-selector [name=value]').val();
Expand All @@ -1376,6 +1420,7 @@ export default class SitemapController {
regexgroup: $('#edit-selector [name=regexgroup]').val(),
};
const uuid = $('#edit-selector [name=uuid]').val();
const conceptTypeId = $('#edit-selector [name=conceptTypeId]').val();

$columnHeaders.each(function (i) {
const header = $($columnHeaders[i]).val();
Expand All @@ -1388,7 +1433,7 @@ export default class SitemapController {
});
});

let options = {
const options = {
id,
selector: selectorsSelector,
tableHeaderRowSelector,
Expand All @@ -1414,8 +1459,10 @@ export default class SitemapController {
textmanipulation,
stringReplacement,
mergeIntoList,
dontFlatten,
outerHTML,
uuid,
conceptTypeId,
};

return SelectorList.createSelector(options);
Expand Down
17 changes: 14 additions & 3 deletions src/scripts/DataExtractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,11 @@ export default class DataExtractor {
newParentElement
);
deferredChildCommonData.done(function (data) {
d.resolve(data);
if (selector.dontFlatten) {
d.resolve({ [selector.id]: data });
} else {
d.resolve(data);
}
});
}
} else {
Expand Down Expand Up @@ -214,7 +218,9 @@ export default class DataExtractor {

selectorData.forEach(
function (element) {
const newCommonData = Object.clone(commonData, true);
const newCommonData = selector.dontFlatten
? {}
: Object.clone(commonData, true);
const childRecordDeferredCall = this.getSelectorTreeData.bind(
this,
selectors,
Expand All @@ -231,7 +237,12 @@ export default class DataExtractor {
responses.forEach(function (childRecordList) {
childRecordList.forEach(function (childRecord) {
const rec = {};
Object.merge(rec, childRecord, true);
if (selector.dontFlatten) {
Object.merge(rec, commonData, true);
Object.merge(rec, { [selector.id]: childRecord }, true);
} else {
Object.merge(rec, childRecord, true);
}
resultData.push(rec);
});
});
Expand Down
29 changes: 17 additions & 12 deletions src/scripts/Job.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
export default class Job {
constructor(url, parentSelector, scraper, parentJob, baseData) {
constructor(url, parentSelector, scraper, parentJob, baseData, baseDataPath) {
if (parentJob !== undefined) {
this.url = this.combineUrls(parentJob.url, url);
} else {
Expand All @@ -9,6 +9,7 @@ export default class Job {
this.scraper = scraper;
this.dataItems = [];
this.baseData = baseData || {};
this.baseDataPath = baseDataPath || [];
}

combineUrls(parentUrl, childUrl) {
Expand Down Expand Up @@ -69,17 +70,12 @@ export default class Job {
this.url,
sitemap,
this.parentSelector,
function (results) {
results => {
// merge data with data from initialization
for (const i in results) {
const result = results[i];
for (const key in this.baseData) {
if (!(key in result)) {
result[key] = this.baseData[key];
}
}
this.dataItems.push(result);
}
results.forEach(result => {
const mergedResult = this.mergeWithBaseData(result);
this.dataItems.push(mergedResult);
});

if (sitemap) {
// table selector can dynamically add columns (addMissingColumns Feature)
Expand All @@ -88,11 +84,20 @@ export default class Job {

console.log(job);
callback(job);
}.bind(this),
},
this
);
}

mergeWithBaseData(result) {
const mergedData = structuredClone(this.baseData);
const { _url, _timestamp, ...resultData } = result;
const insertAt = this.baseDataPath.reduce((data, key) => data[key] || {}, mergedData);
Object.assign(mergedData, { _url, _timestamp });
Object.assign(insertAt, resultData);
return mergedData;
}

getResults() {
return this.dataItems;
}
Expand Down
Loading