Scalar_Backup

<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:scalar="http://scalar.usc.edu/2012/01/scalar-ns#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:ov="http://open.vocab.org/terms/" xmlns:sioc="http://rdfs.org/sioc/ns#" xmlns:oac="http://www.openannotation.org/ns/" xmlns:art="http://simile.mit.edu/2003/10/ontologies/artstor#">

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/advanced-options">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-16T10:37:11+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:316238"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/advanced-options.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/advanced-options.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/advanced-options.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Advanced Options</dcterms:title>
    <dcterms:description>Manual page for the Lexos Tokenize and Analyze Advanced Options</dcterms:description>
    <sioc:content>&lt;h4&gt;&lt;u&gt;Tokenize&lt;/u&gt;&lt;/h4&gt;By default Lexos splits strings of text into tokens every time it encounters a space character. For Western languages, this means that each token generally corresponds to a word. Click the &lt;strong&gt;by Characters&lt;/strong&gt; radio button to treat every character as a separate token. If you wish to use n-grams, increase the &lt;strong&gt;1-gram&lt;/strong&gt; incrementer to 2, 3, 4, etc. For example, &amp;quot;the dog ran&amp;quot; would produce the 1-gram tokens &lt;em&gt;the&lt;/em&gt;, &lt;em&gt;dog&lt;/em&gt;, &lt;em&gt;ran&lt;/em&gt;., the 2-grams &lt;em&gt;the dog&lt;/em&gt;, &lt;em&gt;dog ran&lt;/em&gt;, and so on. 2-grams tokenized by characters would begin &lt;em&gt;th&lt;/em&gt;, &lt;em&gt;he&lt;/em&gt;, &lt;em&gt;e&amp;nbsp;&lt;/em&gt;, and so on.&lt;br /&gt;&lt;br /&gt;Note that increasing the n-gram size my produce a larger DTM, and the table will thus take longer to build.&lt;h4&gt;&lt;u&gt;Culling Options&lt;/u&gt;&lt;/h4&gt;&amp;quot;Culling Options&amp;quot; is a generic term we use for methods of decreasing the number of terms used to generate the DTM based on statistical criteria (as opposed to something like applying a stopword list in &lt;strong&gt;Scrubber&lt;/strong&gt;). Lexos offer three different methods:&lt;br /&gt;&lt;br /&gt;1. &lt;strong&gt;Most Frequent Words&lt;/strong&gt;: This method takes a slice of the DTM containing only the top N most frequently occurring terms. The default setting is 100.&lt;br /&gt;2. &lt;strong&gt;Culling&lt;/strong&gt;: This method builds the DTM using only terms that occur in at least N documents. The default setting is 1.&lt;br /&gt;3. &lt;strong&gt;Greywords&lt;/strong&gt;: This method removes from the DTM those terms occurring in particularly low frequencies. Lexos calculates the cut-off point based on the average length of your documents.&lt;h4&gt;&lt;u&gt;Normalize&lt;/u&gt;&lt;/h4&gt;By default, Lexos displays the frequency of the occurrence of terms in your documents as a proportion of the entire text. If you wish to see the actual number of occurrences, click the &lt;strong&gt;Raw Counts&lt;/strong&gt; radio button. You may also attempt to take into account differences in the lengths of your documents by calculating their &lt;a target="_blank" href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf"&gt;Term Frequency-Inverse Document Frequency (TF-IDF)&lt;/a&gt;. Lexos offers three different methods of calculating TF-IDF based on &lt;strong&gt;Euclidean Distance&lt;/strong&gt;, &lt;strong&gt;Manhattan Distance&lt;/strong&gt;, or without using a distance metric (&lt;strong&gt;Norm: None&lt;/strong&gt;). For further discussion on these options, see the topics article on [TF-IDF](http://scalar.usc.edu/works/lexos/tf-idf).&lt;h4&gt;&lt;u&gt;Assign Temporary Labels&lt;/u&gt;&lt;/h4&gt;Lexos automatically uses the label in the &amp;quot;Document Name&amp;quot; column in the &lt;strong&gt;Manage&lt;/strong&gt; tool as the document label. However, you may change the label used in your table by entering a new value for it in the forms displayed in &lt;strong&gt;Assign Temporary Labels&lt;/strong&gt;. This is particularly useful if you want to save different labels when you download your DTM. Keep in mind that whatever labels you set will be applied in all other Lexos tools that use the Advanced Options. However, the original document name in &lt;strong&gt;Manage&lt;/strong&gt; will not be affected. After assigning temporary labels in &lt;strong&gt;Tokenizer&lt;/strong&gt;, click the &lt;strong&gt;Regenerate Table&lt;/strong&gt; button to rebuild the table with the new labels.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-16T10:37:11+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:834672"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/advanced-options"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/ajaxtest">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <scalar:customScript>$(document).ready(function(){
    $("#github-content").load("https://raw.githubusercontent.com/WheatonCS/Lexos/master/0_InstallGuides/Windows/README.md", function(markdown){
      var md = new Remarkable('commonmark');
      var html = md.render(markdown);
      $("#github-content").html(html).show();
    });
  });</scalar:customScript>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-07-08T14:33:47+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:477723"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/ajaxtest.7"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/ajaxtest.7"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/ajaxtest.7">
    <ov:versionnumber>7</ov:versionnumber>
    <dcterms:title>Content from GitHub</dcterms:title>
    <sioc:content>&lt;p&gt;Material below the horizontal rule is fetched from the Lexos GitHub repository using Ajax. This avoids Scalar&amp;#39;s clunky editor (you can edit offline) and provides better version control. The downside is that you don&amp;#39;t get Scalar&amp;#39;s embedding markup (I&amp;#39;m not sure how necessary it actually is). Although I&amp;#39;ve used Ajax, we could also grab the material from GitHub&amp;#39;s API. But this might actually be more complicated.&lt;/p&gt;&lt;p&gt;Since the original document on GitHub is in Markdown, I have had to use a script to convert it to HTML. I&amp;#39;ve used &lt;a target="_blank" href="https://github.com/jonschlinkert/remarkable"&gt;Remarkable&lt;/a&gt;, which seems to work pretty well.&lt;/p&gt;&lt;hr /&gt;&lt;script src="https://cdn.jsdelivr.net/remarkable/1.7.1/remarkable.min.js"&gt;&lt;br /&gt;  &lt;br /&gt;  &lt;br /&gt;  &lt;br /&gt;&lt;/script&gt;&lt;div id="github-content" style="display:none;"&gt;&amp;nbsp;&lt;/div&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-07-08T15:44:50+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1257687"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/ajaxtest"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/bibliography">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-09T06:58:09+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:160478"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/bibliography.8"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/bibliography.8"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/bibliography.8">
    <ov:versionnumber>8</ov:versionnumber>
    <dcterms:title>Bibliography</dcterms:title>
    <dcterms:description>Beginning of bibliography path</dcterms:description>
    <sioc:content>&lt;p&gt;We are working on our bibliography. In the meantime, check out the Zotero bibliographies listed below.&lt;/p&gt;&lt;!--&lt;br /&gt;undefined&lt;iframe src="http://bibbase.org/show?bib=https%3A%2F%2Fapi.zotero.org%2Fgroups%2F47671%2Fitems%3Fkey%3DrjMBGMVIUHoqwuGZRNI4HppO%26format%3Dbibtex%26limit%3D100" style="min-height:600px;" width="100%"&gt;&lt;br /&gt;&lt;/iframe&gt;&lt;br /&gt;--&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-04-04T14:59:08+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1105242"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/bibliography"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1105242:439018:1">
    <scalar:urn rdf:resource="urn:scalar:path:1105242:439018:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/bibliography.8"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/dariah-bibliography.6#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/dariah-bibliography">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-29T17:40:26+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:176729"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/dariah-bibliography.6"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/dariah-bibliography.6"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/dariah-bibliography.6">
    <ov:versionnumber>6</ov:versionnumber>
    <dcterms:title>DARIAH Bibliography</dcterms:title>
    <dcterms:description>Zotero bibliography for Digital Humanities maintained by the DARIAH collaborative</dcterms:description>
    <sioc:content>&lt;iframe src="https://www.zotero.org/groups/doing_digital_humanities_-_a_dariah_bibliography/items" style="height:800px;width:100%;border:1px solid #000;"&gt;&lt;/iframe&gt;</sioc:content>
    <scalar:defaultView>blank</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-29T17:47:52+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:439018"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/dariah-bibliography"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1105242:1105215:2">
    <scalar:urn rdf:resource="urn:scalar:path:1105242:1105215:2"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/bibliography.8"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/stylometry-bibliography.1#index=2"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/stylometry-bibliography">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-04-04T14:56:43+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:416203"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/stylometry-bibliography.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/stylometry-bibliography.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/stylometry-bibliography.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Stylometry Bibliography</dcterms:title>
    <dcterms:description>Stylometry bibliography generated from DH 2016 conference</dcterms:description>
    <sioc:content>&lt;iframe src="https://www.zotero.org/groups/stylometry_bibliography/items/order/year/sort/desc" style="height:800px;width:100%;border:1px solid #000;"&gt;&lt;br /&gt;&lt;/iframe&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-04-04T14:56:43+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1105215"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/stylometry-bibliography"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/bubbleviz">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-04T10:08:28+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:159681"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/bubbleviz.4"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/bubbleviz.4"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/bubbleviz.4">
    <ov:versionnumber>4</ov:versionnumber>
    <dcterms:title>The BubbleViz Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos BubbleViz tool</dcterms:description>
    <sioc:content>&lt;strong&gt;BubbleViz&lt;/strong&gt; offers an alternative to word clouds as a method of visualizing the &lt;strong&gt;Document-Term Matrix&lt;/strong&gt;. They present terms arranged inside circles (&amp;quot;bubbles&amp;quot;) sized according to the terms&amp;#39; frequency within the text. &lt;strong&gt;BubbleViz&lt;/strong&gt; graphs enable you to get a sense of the content in your corpus, and they are very good for presentations. To generate a &lt;strong&gt;BubbleViz&lt;/strong&gt;, select some or all of your active documents using the &lt;strong&gt;Select Document(s)&lt;/strong&gt; check boxes. The Lexos &lt;strong&gt;BubbleViz&lt;/strong&gt; tool allows you to control the &lt;strong&gt;Graph Size&lt;/strong&gt; (in pixels) and to filter the &lt;strong&gt;Maximum Number of Terms&lt;/strong&gt;. You can also set a &lt;strong&gt;Minimum Term Length&lt;/strong&gt;: the minimum number of characters required in a term for it to be added to the graph. Once you have chosen your options, click the &lt;strong&gt;Get Graph&lt;/strong&gt; button to generate the graph. If you then wish to download the graph as a PNG file, click the &lt;strong&gt;Save as PNG&lt;/strong&gt; button.&lt;br /&gt;&lt;br /&gt;Run your mouse cursor over the bubble to open a tooltip showing the number of times the term occurs in you selected document(s).</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>159681</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-16T20:44:10+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:834855"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/bubbleviz"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/choosing-a-distance-metric">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-12T00:20:24+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:172451"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/choosing-a-distance-metric.5"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/choosing-a-distance-metric.5"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/choosing-a-distance-metric.5">
    <ov:versionnumber>5</ov:versionnumber>
    <dcterms:title>Choosing a Distance Metric</dcterms:title>
    <dcterms:description>More detailed discussion of distance metrics</dcterms:description>
    <sioc:content>In hierarchical clustering, a distance metric must be chosen before running the algorithm for merging documents into clusters. K-means clustering uses standard Euclidean distance to determine the distance from the cluster centroid, but this or other distance measures can be used to evaluate the cluster quality. (In Lexos, this is done through the Silhouette Score, which can be calculated using multiple distance metrics.)

A few general observations have already been made under &lt;a style="z-index: 0;" href="http://scalar.usc.edu/works/lexos/glossary#edit-distance" data-display-content-preview-box="true"&gt;Cluster Analysis&lt;/a&gt;. The distance metric is essentially how you define the difference between your documents. The &lt;span class="annotator-hl"&gt;Euclidean distance&lt;/span&gt; metric measures the magnitude of the difference in distance between two document vectors (vectors of counts for each word in both documents). Non-Euclidean metrics like  &lt;a style="z-index: 0; position: relative;" href="http://scalar.usc.edu/works/lexos/glossary#cosine-similarity" data-display-content-preview-box="true"&gt;cosine similarity&lt;/a&gt;, which measures the angle between the vectors, can also be converted into measures of distance between clusters. Since document-term matrices are often sparse (they contain a lot of term counts of 0), cosine similarity may be a better option for clustering larger documents, and particularly if the documents are of uneven lengths. But the emphasis must be placed on &lt;i&gt;may&lt;/i&gt;. There are no hard and fast rules, although there is renewed attention to providing more nuanced help with the choice of metrics (Jannidis &lt;i&gt;et al.&lt;/i&gt;&amp;nbsp;2015, Eder 201?).

The circumstances under which certain distance metrics perform best, or even how to use machine learning to aid in the selection of such metrics, is the subject of ongoing research. However, much of it uses data very different from the type of material used in literary text analysis. Currently, our best advice is to be aware of how you are measuring distance and experiment with different linkage metrics, trying to explain how they operate on your texts. We provide a case study here which serves to introduce some of the most common metrics (all available in Lexos), and how they affect the results of a single data set.&lt;div&gt;
&lt;/div&gt;&lt;div&gt;&lt;a style="box-sizing: border-box; color: rgb(87, 62, 37); font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12.6000003814697px; line-height: 18.0000019073486px; background: 0px 0px rgb(255, 255, 255);"&gt;Eder, M. (201?). Visualization in stylometry: some problems and solutions. To be published in &lt;i&gt;Digital Scholarship in the Humanities&lt;/i&gt;.&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;a style="box-sizing: border-box; color: rgb(87, 62, 37); font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12.6000003814697px; line-height: 18.0000019073486px; background: 0px 0px rgb(255, 255, 255);"&gt;
&lt;/a&gt;&lt;/div&gt;&lt;div&gt;&lt;a style="box-sizing: border-box; color: rgb(87, 62, 37); font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12.6000003814697px; line-height: 18.0000019073486px; background: 0px 0px rgb(255, 255, 255);"&gt;Jannidis, F.,&lt;/a&gt;&lt;a style="box-sizing: border-box; color: rgb(87, 62, 37); font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12.6000003814697px; line-height: 18.0000019073486px; background: 0px 0px rgb(255, 255, 255);"&gt;&amp;nbsp;Pielström, S.,&lt;/a&gt;&lt;a style="box-sizing: border-box; color: rgb(87, 62, 37); font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12.6000003814697px; line-height: 18.0000019073486px; background: 0px 0px rgb(255, 255, 255);"&gt;&amp;nbsp;Schöch,&amp;nbsp;&lt;/a&gt;&lt;a style="color: rgb(87, 62, 37); box-sizing: border-box; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12.6000003814697px; line-height: 18.0000019073486px; background: 0px 0px rgb(255, 255, 255);"&gt;C.,&lt;/a&gt;&lt;a style="box-sizing: border-box; color: rgb(87, 62, 37); font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12.6000003814697px; line-height: 18.0000019073486px; background: 0px 0px rgb(255, 255, 255);"&gt;&amp;nbsp;Vitt,&amp;nbsp;&lt;/a&gt;&lt;a style="color: rgb(87, 62, 37); box-sizing: border-box; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12.6000003814697px; line-height: 18.0000019073486px; background: 0px 0px rgb(255, 255, 255);"&gt;T&lt;/a&gt;&lt;a style="color: rgb(87, 62, 37); box-sizing: border-box; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; font-size: 12.6000003814697px; line-height: 18.0000019073486px; background: 0px 0px rgb(255, 255, 255);"&gt;. (2015). Improving Burrows' Delta -- An empirical evaluation of text distance measures. Presented at DH 2015 Global Digital Humanities, Sydney, Australia, July 3, 2015.&lt;/a&gt;&lt;/div&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-13T15:45:22+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:431807"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/choosing-a-distance-metric"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/cluster-analysis">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-07-31T23:01:32+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:170618"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/cluster-analysis.35"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/cluster-analysis.35"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/cluster-analysis.35">
    <ov:versionnumber>35</ov:versionnumber>
    <dcterms:title>Cluster Analysis</dcterms:title>
    <dcterms:description>The start page for the cluster analysis topics path</dcterms:description>
    <sioc:content>&lt;em&gt;In order to use cluster analysis successfully to interpret literary texts, it is important to have a good understanding of how the process works.&lt;/em&gt;&lt;p&gt;Cluster analysis may be formally defined as an &lt;a href="http://scalar.usc.edu/works/lexos/glossary#unsupervised-learning"&gt;unsupervised learning&lt;/a&gt; technique for finding &amp;ldquo;natural&amp;rdquo; groupings of given instances in unlabeled data. For the purposes of text analysis, a clustering method is a procedure that starts with a group of documents and attempts to organize them into relatively homogeneous groups called &amp;ldquo;clusters&amp;rdquo;. Clustering methods differ from classification methods (&lt;a href="http://scalar.usc.edu/works/lexos/glossary#supervised-learning"&gt;supervised learning&lt;/a&gt;) in that clustering attempts to form these groups entirely from the data, rather than by assigning documents to predefined groups with designated class labels.&lt;/p&gt;&lt;p&gt;Cluster analysis works by counting the frequency of terms occurring in documents and then grouping the documents based on similarities in these frequencies. When we observe that an author or text uses a term or a group of terms more than other authors or other texts do, we are innately using this technique. In making such claims, we rely on our memory, as well as sometimes unstated selection processes. It may be true, for instance, that Text A uses more terms relating to violence than Text B, but in fact, the difference between the two may be proportionally much less than the difference in the frequency of other terms such as &amp;ldquo;the&amp;rdquo; and &amp;ldquo;is&amp;rdquo; on which we do not traditionally base our interpretation. Cluster analysis leverages the ability of the computer to compare many more terms than is possible (or at least practical) for the human mind. It may therefore reveal patterns that can be missed by traditional forms of reading. Cluster analysis can be very useful for exploring similarities between texts or segments of texts and can also be used as a test for hypotheses you may have about your texts. But it is important to remember that the type of clustering discussed here relies on the frequency of terms, not their semantic qualities. As a result, it can only provide a kind of proxy for meaning. In order to use cluster analysis successfully to interpret literary texts, it is important to have a good understanding of how the process works.&lt;/p&gt;&lt;p&gt;Here, we acknowledge the many levels of expertise it requires to fully appreciate cluster analysis in general and specifically when choosing (i) metrics that define a distance between documents, (ii) metrics that manage the clustering of like documents, and (iii) clustering based on different features of the text (e.g. all or only the most frequent terms) (Eder, &amp;quot;Computational Stylistics and Biblical Translation&amp;quot;). But in all the details, we encourage the reader to seek the benefits of exploratory analysis like cluster analysis early and often, even as you are learning more about the statistical roots of different metrics. It always helps if you have&amp;nbsp; local statistican to consult..&amp;nbsp;&lt;/p&gt;&lt;h2&gt;&lt;strong&gt;Document Similarity&lt;/strong&gt;&lt;/h2&gt;&lt;p&gt;In traditional literary criticism, concepts like &amp;ldquo;genre&amp;rdquo; are frequently used to group texts. There may be some taxonomic criteria for assigning texts to these groups, but recent work in Digital Humanities (Moretti, Jockers, Underwood) have highlighted that such categories can be usefully re-examined using quantitative methods. In cluster analysis, the basis for dividing or assigning documents into clusters is a statistical calculation of their dis(similarity). Similar documents are ones in which there is considerable homogeneity in the frequency with which terms are observed to occur therein. Documents are dissimilar when term frequencies are more heterogeneous.&lt;/p&gt;&lt;p&gt;A clearer picture emerges of this definition of similarity if we examine how it is measured. Imagine three documents as points within a coordinate space.&lt;/p&gt;&lt;p&gt;&lt;strong&gt;&lt;a class="inline" resource="media/cluster-analysis-chart" href="media/ClusterChart1.PNG"&gt;&lt;/a&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;Document A can be imagined to be more similar to Document B than to Document C by using the &lt;a href="http://scalar.usc.edu/works/lexos/glossary#distance-metric"&gt;distance&lt;/a&gt; between them as a metric. Simply draw a straight line between the points, and the documents with the shortest line between them are the most similar. When using cluster analysis for the study of texts, we take as our premise the idea that this notion of similarity measured as proximity may correlate to a range of historical and stylistic relationships amongst the documents in our corpus.&lt;/p&gt;&lt;p&gt;The graph above represents the end of the process. In order to plot our documents in coordinate space, we must first determine the coordinates. This is done by counting the terms in our documents to produce a &lt;a data-mce-href="http://scalar.usc.edu/works/lexos/glossary#document-term-matrix" href="http://scalar.usc.edu/works/lexos/glossary#document-term-matrix"&gt;document-term matrix&lt;/a&gt;. For instance a part of the document-term matrix that produced the graph above might be:&lt;/p&gt;&lt;table border="1" width="100%"&gt;&lt;tbody&gt;&lt;tr&gt;&lt;th&gt;&amp;nbsp;&lt;/th&gt;&lt;th&gt;man&lt;/th&gt;&lt;th&gt;woman&lt;/th&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Document A&lt;/td&gt;&lt;td&gt;5&lt;/td&gt;&lt;td&gt;4&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Document B&lt;/td&gt;&lt;td&gt;4&lt;/td&gt;&lt;td&gt;5&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Document C&lt;/td&gt;&lt;td&gt;1&lt;/td&gt;&lt;td&gt;3&lt;/td&gt;&lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;&lt;p&gt;The list of term counts for each document is called a &lt;a data-mce-href="http://scalar.usc.edu/works/lexos/glossary#document-vector" href="http://scalar.usc.edu/works/lexos/glossary#document-vector"&gt;document vector&lt;/a&gt;. Representing the text as a vector of term counts allows us to calculate the distance or dissimilarity between documents. We can easily convert the document-term matrix into a &amp;quot;distance matrix&amp;quot; (also called a &amp;quot;dissimilarity matrix&amp;quot;) by taking the difference between the term counts for each document vector.&lt;/p&gt;&lt;table border="1" width="100%"&gt;&lt;tbody&gt;&lt;tr&gt;&lt;td&gt;Man&lt;/td&gt;&lt;td&gt;Document A&lt;/td&gt;&lt;td&gt;Document B&lt;/td&gt;&lt;td&gt;Document C&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Document A&lt;/td&gt;&lt;td&gt;-&lt;/td&gt;&lt;td&gt;1&lt;/td&gt;&lt;td&gt;4&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Document B&lt;/td&gt;&lt;td&gt;1&lt;/td&gt;&lt;td&gt;-&lt;/td&gt;&lt;td&gt;3&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Document C&lt;/td&gt;&lt;td&gt;4&lt;/td&gt;&lt;td&gt;3&lt;/td&gt;&lt;td&gt;-&lt;/td&gt;&lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;&amp;nbsp;&lt;p&gt;The distance from A to B is 1 while the distance from B to C is 3. Documents A and B form a cluster because the distance between them is shorter than between either and Document C.&lt;/p&gt;&lt;p&gt;Notice that the table above reproduces only the portion of the document vectors representing the frequency of the word &amp;ldquo;man&amp;rdquo;. Adding the &amp;ldquo;woman&amp;rdquo; portion creates considerable difficulties for us if we are trying to represent the data in rows and columns. That is because each term in the document vector represents a separate dimension of that vector. The full text of a document may be represented by a vector with thousands of dimensions. Imagine a spreadsheet with thousands of individual sheets, one for each term in the document, and you get the idea. In order for the human mind to interpret this data, we need to produce a flattening, or &lt;a data-display-content-preview-box="true" href="glossary#dimensionality-reduction"&gt;dimensionality reduction&lt;/a&gt;, of the whole distance matrix. The computer does this by algorithmically going through the distance matrix and adjusting the distance between each document vector on the &lt;em&gt;observed distances&lt;/em&gt; between each of the terms. There are, in fact, different algorithms for doing this, and part of a successful use of cluster analysis involves choosing the algorithm best suited for the materials being examined.&lt;/p&gt;&lt;p&gt;Many discussions of this process begin with the notion of &lt;a data-display-content-preview-box="true" href="glossary#feature-selection"&gt;feature selection&lt;/a&gt;. In text analysis, this equates to determining what features of the text make up the document vector. The procedure for feature selection is essentially the processes of &lt;a data-display-content-preview-box="true" href="glossary#scrubbing"&gt;scrubbing&lt;/a&gt;, &lt;a data-display-content-preview-box="true" href="glossary#cutting"&gt;cutting&lt;/a&gt;, and &lt;a data-display-content-preview-box="true" href="glossary#tokenization"&gt;tokenization&lt;/a&gt;. You may also perform certain &lt;a data-display-content-preview-box="true" href="glossary#normalization"&gt;normalization&lt;/a&gt; measures that modify the term frequencies in order to account for differences in document length. Depending on how you perform these tasks, the results of cluster analysis can be very different.&lt;br /&gt;&lt;br /&gt;One of the factors that will influence your results is your choice of &lt;a data-display-content-preview-box="true" href="glossary#distance-metric"&gt;distance metric&lt;/a&gt;. The distance metric is essentially how you define the difference between your documents. A simple example using a distance metric might be the words &lt;em&gt;cat&lt;/em&gt; and &lt;em&gt;can&lt;/em&gt; (think of these words as documents composed of vectors of three letters each). A distance metric called &lt;a data-display-content-preview-box="true" href="glossary#edit-distance"&gt;edit distance&lt;/a&gt; can be defined as the number of character changes required to transform &lt;em&gt;cat&lt;/em&gt; into &lt;em&gt;can&lt;/em&gt; (here, just one change = 1). The difference between &lt;em&gt;cat&lt;/em&gt; and &lt;em&gt;call&lt;/em&gt; would be 2. Edit distances are very good for measuring the distance between short strings of characters like individual words, but they are unwieldy for longer document vectors. So for whole texts we need some other way of defining distance.&amp;nbsp;&lt;/p&gt;&lt;p&gt;The &lt;a data-display-content-preview-box="true" href="http://scalar.usc.edu/works/lexos/glossary#euclidean-distance"&gt;Euclidean distance&lt;/a&gt; metric measures the magnitude of the difference in distance between two document vectors. The Euclidean distance is essentially the length of a line between a point representing a term on one vector and a point representing the same term on the other. You might then decide to take the average Euclidean distance for all points and treat that as the measure of document distance. Statisticians have developed a number of metrics based on modifications of Euclidean distance that can serve as alternative ways of defining document distance. Another approach is to measure the &lt;a data-display-content-preview-box="true" href="glossary#similarity"&gt;similarity&lt;/a&gt; of the two vectors. It may help the reader to visualize two documents with N unique words as represented by two vectors sticking out into N-space. Given two documents, a common method of computing distance is to calculate the &lt;a data-display-content-preview-box="true" href="glossary#cosine-similarity"&gt;cosine similarity&lt;/a&gt;, which is the angle between the two document vectors. Cosine similarity is not truly a measure of distance, but it can be converted into a distance measure by subtracting the cosine similarity value (which varies between 0 and 1) from 1.&lt;/p&gt;&lt;p&gt;The difference between Euclidean and non-Euclidean metrics for measuring document distance is largely one of perspective. Euclidean distances tend to measure the distance between documents at some point along the vector where the documents are already quite distinct from one another. In longer documents, this distinction may correlate to a lot of terms which are found in one document but not the other. In the document-term matrix, counts for these terms in the documents where they do not occur are recorded as 0. A distance matrix in which most of the elements are zero is called a &lt;a data-display-content-preview-box="true" href="glossary#sparse-matrix"&gt;sparse matrix&lt;/a&gt;. The more dimensions there are in the document vectors, the more likely it is that the distance matrix will be sparse. Many &lt;i&gt;Hapax legomena&lt;/i&gt; (terms occurring only once) in your documents, for instance, will very likely produce a sparse document-term matrix.&amp;nbsp;&lt;/p&gt;&lt;p&gt;Measuring Euclidean distance in a sparse matrix can affect the way clustering takes place. We would certainly expect this to be the case if one document was much longer than the other. Using cosine similarity is one way to address this problem since the angle between document vectors does not change depending on the location of points along the vector. There are a large variety of variants on these basic Euclidean and non-Euclidean approaches to defining the distance metric for clustering. For further discussion, see &lt;a data-display-content-preview-box="true" href="choosing-a-distance-metric"&gt;Choosing a Distance Metric&lt;/a&gt;.&lt;/p&gt;&lt;p&gt;Other factors influencing your results will depend on the type of cluster analysis you use, your choice of linkage method, the choice of token-type, and the size of the sample of terms considered (e.g., the whole text, as opposed to the top 100 most frequent terms).&lt;/p&gt;&lt;h2&gt;Types of Cluster Analysis&lt;/h2&gt;&lt;p&gt;There are two main approaches to clustering: &lt;a data-display-content-preview-box="true" href="glossary#hierarchical-cluster-analysis"&gt;hierarchical&lt;/a&gt; and &lt;a data-display-content-preview-box="true" href="glossary#partitioning-cluster-analysis"&gt;partitioning&lt;/a&gt;. &lt;a data-display-content-preview-box="true" href="hierarchical-clustering"&gt;Hierarchical clustering&lt;/a&gt; attempts to divide documents into a branching tree of groups and sub-groups, whereas partitioning methods attempt to assign documents to a pre-designated number of clusters. Clustering methods may also be described as &lt;a data-display-content-preview-box="true" href="glossary#exclusive-cluster-analysis"&gt;exclusive&lt;/a&gt;, generating clusters in which no document can belong to more than one cluster, or &lt;a data-display-content-preview-box="true" href="glossary#overlapping-cluster-analysis"&gt;overlapping&lt;/a&gt;, in which documents may belong belong to multiple clusters. Hierarchical methods allow &lt;em&gt;clusters&lt;/em&gt; to belong to other clusters, whereas &lt;a data-display-content-preview-box="true" href="glossary#flat-cluster-analysis"&gt;flat&lt;/a&gt; methods do not allow for this possibility.&lt;i&gt; Lexos&lt;/i&gt; implements two types of cluster analysis: a form of hierarchical clustering called &lt;a data-display-content-preview-box="true" href="glossary#agglomerative-hierarchical-clustering"&gt;agglomerative hierarchical clustering&lt;/a&gt; and a form of flat partitioning called &lt;a data-display-content-preview-box="true" href="glossary#k-means-clustering"&gt;K-Means&lt;/a&gt;.&lt;/p&gt;&lt;p&gt;While a full discussion of the many types of cluster analysis is beyond the scope of this work, we may note two other methods that are commonly used in literary text analysis. One such method is topic modeling, which generates clusters of words that appear in close proximity to each other within the corpus. The most popular tool for topic modeling is &lt;a href="http://mallet.cs.umass.edu/"&gt;Mallet&lt;/a&gt;, and the Lexos Multicloud tools allows you to generate topic clouds of the resulting clusters from Mallet data. Another type of clustering is often referred to as &lt;a href="https://en.wikipedia.org/wiki/Community_structure"&gt;community detection&lt;/a&gt;, where algorithms are used to identify clusters of related nodes in a network. More information about community detection in network data can be found here [needs a link].&lt;/p&gt;&lt;h2&gt;Strengths and Limitations of Cluster Analysis&lt;/h2&gt;&lt;p&gt;Cluster analysis has been put to good use for a variety of purposes. Some of the most successful work using cluster analysis has been in the area of authorship attribution, detection of collaboration, source study, and translation (REFs). Above all, it can be a useful provocation, helping to focus enquiry on texts or sections of texts that we might otherwise ignore.&lt;/p&gt;&lt;p&gt;It is important to be aware that there is not a decisive body of evidence supporting the superiority of one clustering method over another. Different methods often produce very different results. There is also a fundamental circularity to cluster analysis in that it seeks to discover structure in data by imposing structure on it. While these considerations may urge caution, it is equally important to remember that they have analogues in traditional forms of an analysis and interpretation.&lt;/p&gt;&lt;p&gt;Regardless of which method is chosen, we should be wary of putting too much stock in any single result. Cluster analysis is most useful when it is repeated many times with slight variations to determine which results are most useful. One of the central concerns will always be the &lt;strong&gt;validity&lt;/strong&gt; of algorithmically produced clusters. In part, this is a question of statistical validity based on the nature of texts used and the particular implementation chosen for clustering. There is ongoing research on what statistical criteria makes a cluster a &amp;quot;good cluster&amp;quot;--and how to learn that is for a given data set--but there is very little consensus that is of practical use for textual analysis. In Lexos, we include a statistical measure called the &lt;a data-display-content-preview-box="true" href="glossary#silhouette-score"&gt;Silhouette Score&lt;/a&gt;, which gives a general indication of how well documents lie within their clusters. Silhouette scores do not reply on knowing class labels beforehand.&amp;nbsp;However, a high or low Silhouette Score should not be taken to mean that the clustering is better or worse. It is merely one of many possible measures we could use. [See http://blog.data-miners.com/2011/03/cluster-silhouettes.html for further information in Silhouette Scores.] For further discussion, see &lt;a data-display-content-preview-box="true" href="establishing-robust-clusters"&gt;Establishing Robust Clusters&lt;/a&gt;.&lt;/p&gt;&lt;p&gt;There is also the more fundamental question of whether similarity as measured by distance metrics corresponds to similarity as apprehended by the human psyche or similarity in terms of the historical circumstances that produced the texts under examination. One of the frequent complaints about cluster analysis is that, in reducing the dimensionality of the documents within the cluster, it occludes access to the content&amp;mdash;especially the semantics of the content&amp;mdash;responsible for the grouping of documents. Point taken. Treating documents as vectors of term frequencies ignores information, but the success of distance measures on n-dimensional vectors of term counts is clear: cluster analysis continues to support exploration that helps define next steps, for example, new ways of segmenting a set of old texts.&lt;/p&gt;&lt;h2 id="sources"&gt;Sources:&lt;/h2&gt;&lt;p&gt;&lt;a href="http://www.mimuw.edu.pl/~son/datamining/DM/8-Clustering_overview_son.pdf"&gt;http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.332.4480&amp;amp;rep=rep1&amp;amp;type=pdf&lt;/a&gt;&lt;/p&gt;&lt;p&gt;&lt;a href="http://www.mimuw.edu.pl/~son/datamining/DM/8-Clustering_overview_son.pdf"&gt;http://www.mimuw.edu.pl/~son/datamining/DM/8-Clustering_overview_son.pdf&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.daniel-wiechmann.eu/downloads/cluster_%20analysis.pdf"&gt;http://www.daniel-wiechmann.eu/downloads/cluster_%20analysis.pdf&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.stat.wmich.edu/wang/561/classnotes/Grouping/Cluster.pdf"&gt;http://www.stat.wmich.edu/wang/561/classnotes/Grouping/Cluster.pdf&lt;/a&gt;&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-19T16:34:39+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:838227"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/cluster-analysis"/>
    <dcterms:references rdf:resource="http://scalar.usc.edu/works/lexos/media/cluster-analysis-chart"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/cluster-analysis-chart">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <art:thumbnail rdf:resource="http://scalar.usc.edu/works/lexos/media/ClusterChart1_thumb.PNG"/>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-03T20:05:23+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:170894"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/cluster-analysis-chart.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/cluster-analysis-chart.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/cluster-analysis-chart.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Cluster Analysis Chart</dcterms:title>
    <dcterms:description>Illustrates document similarity</dcterms:description>
    <art:url rdf:resource="http://scalar.usc.edu/works/lexos/media/ClusterChart1.PNG"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-03T20:05:23+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:427078"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/cluster-analysis-chart"/>
    <dcterms:isReferencedBy rdf:resource="http://scalar.usc.edu/works/lexos/cluster-analysis"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:838227:839333:1">
    <scalar:urn rdf:resource="urn:scalar:path:838227:839333:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/cluster-analysis.35"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical-clustering.17#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/hierarchical-clustering">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-09T20:18:09+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:172202"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical-clustering.17"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical-clustering.17"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/hierarchical-clustering.17">
    <ov:versionnumber>17</ov:versionnumber>
    <dcterms:title>Hierarchical Clustering</dcterms:title>
    <dcterms:description>Manual page for the Lexos Hierarchical Clustering tool</dcterms:description>
    <sioc:content>Hierarchical cluster analysis is a good first choice when asking new questions about texts. Our experience has shown that this approach is remarkably versatile (REF). Perhaps more than any one individual method, the results from our cluster analyses continue to generate new interesting and focused questions.&lt;p&gt;Hierarchical clustering does not require you to choose the number of clusters to begin with. A dendrogram, a visual representation of the clusters, can be built by two methods. &lt;a data-display-content-preview-box="true" href="glossary#divisive-hierarchical-clustering"&gt;Divisive hierarchical clustering&lt;/a&gt; begins with only one cluster (consisting of all documents) and proceeds to cut it into separate &amp;ldquo;sub-clusters&amp;rdquo;, repeating the process until the criterion for dividing them has been exhausted. Alternately,&amp;nbsp;&lt;a data-display-content-preview-box="true" href="glossary#agglomerative-hierarchical-clustering"&gt;agglomerative hierarchical clustering&lt;/a&gt; begins with every document as its own cluster and then proceeds to assign these items to &amp;ldquo;super-clusters&amp;rdquo; based on the selected &lt;a data-display-content-preview-box="true" href="glossary#distance-metric"&gt;distance metric&lt;/a&gt; and &lt;a data-display-content-preview-box="true" href="glossary#linkage"&gt;linkage&lt;/a&gt; criteria (see below). Lexos offers a tool for performing agglomerative hierarchical clustering.&lt;/p&gt;&lt;p&gt;The clusters that result from hierarchical clustering are typically visualized with a two-dimensional tree diagram called a &lt;a data-display-content-preview-box="true" href="http://scalar.usc.edu/works/lexos/glossary#dendrogram"&gt;dendrogram&lt;/a&gt;.[There probably needs to be a short summary of the terms used in the video here (clade, leaf, simplicifolious, etc.)--in case people don&amp;#39;t want to watch the video.] For more information about the construction and interpretation of dendrograms in this method, see the video below:&lt;/p&gt;&amp;nbsp;&lt;p&gt;&lt;a class="inline" resource="how-to-read-a-dendrogram" href="https://www.youtube.com/watch?v=MX6AUX1b1w0"&gt;&lt;/a&gt;&lt;/p&gt;&amp;nbsp;&lt;p&gt;Since the resulting tree technically contains clusters at multiple levels, the result of the cluster analysis is obtained by &amp;ldquo;cutting&amp;rdquo; the tree at the desired level. Each connected component then forms a cluster for interpretation.&lt;/p&gt;&lt;p&gt;The results of hierarchical clustering and the topography of the resulting dendrogram may vary depending on &lt;a data-display-content-preview-box="true" href="glossary#distance-metric"&gt;distance metric&lt;/a&gt;,&amp;nbsp;&lt;a data-display-content-preview-box="true" href="glossary#linkage"&gt;linkage&lt;/a&gt; criterion used to form the clusters, and other factors such as tokenization and the number of most frequent words used. The distance metric is the measure used for defining what constitutes document similarity, how &amp;quot;far&amp;quot; (distance) one document is from another. [&lt;i style="font-weight: bold;"&gt;NOTE: i don&amp;#39;t think this next sentence is correct:&amp;nbsp;&lt;/i&gt;&lt;b&gt;The linkage criterion specifies which terms in the documents are used to assign the documents to a particular clade.] &amp;nbsp;&lt;/b&gt;&lt;i&gt;rather&lt;/i&gt;&lt;b&gt; ...&amp;nbsp;&lt;/b&gt;The linkage criterion specifies which distances between documents are used to define how similar a document is to a previously formed cluster. [This is an improvement. I want to get in there that the distances can be between, say subsets of terms, but it seems to be too much for a single sentence. Better to let the individual types of linkage illustrate what is involved.&lt;/p&gt;&lt;p&gt;Hierarchical clustering presents the user with three main challenges:&lt;/p&gt;&lt;ol&gt;&lt;li&gt;Which distance metric to use.&lt;/li&gt;&lt;li&gt;What type of linkage criterion to select.&lt;/li&gt;&lt;li&gt;Where to cut the tree.&lt;/li&gt;&lt;/ol&gt;&lt;p&gt;Each of these challenges will be considered in turn.&lt;/p&gt;&lt;h2&gt;Selecting a Distance Metric&lt;/h2&gt;&lt;p&gt;This is one of the least well-understood (and least well-documented) aspects of the hierarchical clustering method. Since we are representing texts as document vectors, it makes sense to define document similarity by comparing the two vectors. One way to do this is to select points (terms) on the vectors of two documents and measure the distance between them. If the two vectors are visualized as lines in a triangle, the hypotenuse between these lines can be used as a measure of the distance between the two documents. This standard means of measuring how far apart two documents are is known as &lt;a data-display-content-preview-box="true" href="glossary#euclidean-distance"&gt;Euclidean distance&lt;/a&gt;. Euclidean distance can be calculated using the square root of the sum of the squares of the differences between corresponding coordinates of points on the document vectors. Despite this mouthful, Euclidean distance is an excellent metric to begin with (and we have had good success with it). Non-Euclidean methods are also possible. For instance, another commonly used measure is &lt;a data-display-content-preview-box="true" href="glossary#cosine-similarity"&gt;cosine similarity&lt;/a&gt;, which relates the distance between the two documents to the angle between their two vectors. While Euclidean distance will vary depending on which points on the vector are used to calculate the distance, the angle between the vectors does not change. Both of these measures are good starting points. Another is Squared Euclidean distance. This is the same as the Euclidean distance, but it does not take the square root as the final part of the calculation. Because it omits this extra step Squared Euclidean distance can be a good choice for larger data sets that take longer to process. Lexos provides a variety of options for use as distance metrics. Further discussion can be found under &lt;a data-display-content-preview-box="true" href="choosing-a-distance-metric"&gt;Choosing a Distance Metric&lt;/a&gt;.&lt;/p&gt;&lt;h2&gt;Choosing a Linkage Method&lt;/h2&gt;&lt;p&gt;The second choice that must be made before running a clustering algorithm is the linkage method. At each stage of the clustering process a choice must be made about whether two clusters should be joined (and recall that a single document itself forms a cluster at the lowest level of the hierarchy). An intuitive means for doing this is to join the cluster containing a point (e,g, a term frequency) closest to the current cluster. This is known as &lt;a data-display-content-preview-box="true" href="glossary#single-linkage"&gt;single linkage&lt;/a&gt;, which joins clusters based on only a single point. Single linkage does not take into account the rest of the points in the cluster, and the resulting dendrograms tend to have spread out clusters. This process is called &amp;quot;chaining&amp;quot;. &lt;a data-display-content-preview-box="true" href="glossary#complete-linkage"&gt;Complete linkage&lt;/a&gt; uses the opposite approach. It takes the two points furthest apart between the current cluster and the others. The cluster with the shortest distance to the current cluster is joined to it. Complete linkage thus takes into account all the points on the vector that come before the one with the maximum distance. It tends to produce compact, evenly distributed clusters in the resulting dendrograms. &lt;a data-display-content-preview-box="true" href="glossary#average-linkage"&gt;Average linkage&lt;/a&gt; is a compromise between single and complete linkage. It takes the average distance of all the points in each cluster and uses the shortest average distance for deciding which cluster should be joined to the current one. We have had good success with average linkage. The &lt;a data-display-content-preview-box="true" href="glossary#weigted-linkage"&gt;weighted average&lt;/a&gt; linkage performs the average linkage calculation but weights the distances based on the number of terms in the cluster. It therefore may be a good option when there is significant variation in the size of the documents under examination. Another commonly used form of linkage (not currently available in Lexos) is &lt;a data-display-content-preview-box="true" href="glossary#wards-criterion"&gt;Ward&amp;#39;s criterion&lt;/a&gt;, which attempts to minimize the differences in cluster size as the dendrogram is built. It may not be appropriate for use with documents of variable size. [I have found these concise but comprehensible accounts mostly at http://academic.reed.edu/psychology/stata/analyses/advanced/agglomerative.html, but I have modified the phrasing.] Visualizations of the differences between the linkage criteria can be seen &lt;a href="http://www.molmine.com/help/algorithms/linkage.htm"&gt;here&lt;/a&gt;. [We should probably look for or make an example with better graphics.] Which linkage criterion you choose depends greatly on the variability of your data and your expectations of its likely cluster structure. The fact that it is very difficult to predict this in advance may explain why the &amp;quot;compromise&amp;quot; of average linkage has proved successful for us.&lt;/p&gt;&lt;h2&gt;Cutting the Dendrogram&lt;/h2&gt;&lt;p&gt;Once the dendrogram has been generated, every document leaf will form its own cluster and all documents will belong to a single cluster at the root. In between, there may be any number of clusters formed at differing levels of the hirerarchy. Not all of these clusters will necessarily be meaningful. For example, if you are trying to test the authorship of Shakespearean plays, it may not be significant that &lt;i&gt;Macbeth&lt;/i&gt; and &lt;i&gt;A Midsummer Night&amp;#39;s Dream&lt;/i&gt; fall within the same cluster. It will be more interesting if a Renaissance play we do not know to be by Shakespeare falls within a cluster containing the above plays and not into clusters containing plays by other authors. On the other hand, if we are interested in the question of genre, we might be very interested to know whether &lt;i&gt;Richard II&lt;/i&gt;, normally considered a history play, clusters with the tragedy of &lt;i&gt;Macbeth&lt;/i&gt; or the comedy of &lt;i&gt;A Midsummer Night&amp;#39;s Dream&lt;/i&gt;. In practice, these sorts of considerations will cause us to draw a line on the dendrogram (often at a particular branch height) below which we will not consider clusters significant. This is known as cutting the dendrogram. Where to draw the line can be an impressionistic exercise. Like our choice of linkage, it will depend a great deal on our expectations of our data. Lexos provides two methods of aiding us. Lexos automatically cuts the tree at a threshold set to 70% of the larger of the first two rows in the distance matrix. All connected nodes below this threshold will be given a common color. All branches connecting nodes with distances greater than or equal to the threshold are colored blue. [This is from the scipy documentation: http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html#scipy.cluster.hierarchy.dendrogram] Note that this is a default behavior and may therefore not be entirely appropriate for your material. Lexos also allows you to &amp;quot;prune&amp;quot; your dendrogram by restricting the number of leaves displayed. The primary goal of this option is to prevent overlapping labels in dendrograms containing many documents, but it can also help you to identify the most appropriate level to cut our dendrogram.&lt;/p&gt;&lt;p&gt;It should be clear from the above that interpreting dendrograms requires both an understanding of the choice of implementation and an understanding of the content of the materials being clustered. Furthermore, the structure of of the dendrogram and its interpretation are highly dependent on our expectations about the text we are studying. This epistemological loop is well known in the Humanities, where it is taken for granted that one&amp;#39;s perspective and biases influence interpretation. In hierarchical cluster analysis, the decision-making required for implementation builds these limitations into the method, but hopefully calls attention to them as well.&lt;/p&gt;&lt;h2&gt;Further Considerations&lt;/h2&gt;&lt;p&gt;We end with some miscellaneous issues which you should be aware of in choosing hierarchical clustering as a method. First, it does not scale well. If you have a large number of documents, or large documents, the number of computations can theoretically be a strain on a computer&amp;#39;s processing power. We have not yet established a threshold where this becomes problematic (especially since it will vary on different machines), but, if you appear to be encountering this problem, trying a simpler distance metrics like Squared Euclidean may help. If you do manage to produce a dendrogram with large numbers of leaves, you may trouble reading it because the leaf labels overlap. In Lexos, limiting the number of leaves displayed may help.&lt;/p&gt;&lt;p&gt;These are largely practical situations, but there are also some conceptual ones. In hierarchical clustering, all items (documents and the terms they contain) are forced into clusters, a scenario that may not accurately reflect the relationships of the original texts. Another issue is that hierarchical clustering assigns documents to clusters early during the process and has no method for undoing that partitioning based on data it encounters later. If that appears to be a problem, we suggest trying K-Means clustering, which adjusts cluster membership at each step.&lt;/p&gt;&lt;p&gt;Statisticians have identified many strengths and shortcomings of hierarchical clustering as a method, and there is ongoing research on the most appropriate distance measures and linkage criteria (much of it using data unlike that employed in literary text analysis). In our test cases, we have typically found that the Euclidean metric with average linkage provides good results. However, Lexos allows you, even encourages you, to apply a number of algorithms and compare the results. This may be one method of establishing whether a particular clustering is valuable. See further &lt;a data-display-content-preview-box="true" href="establishing-robust-clusters"&gt;Establishing Robust Clusters&lt;/a&gt;.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T14:11:12+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839333"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical-clustering"/>
    <dcterms:references rdf:resource="http://scalar.usc.edu/works/lexos/how-to-read-a-dendrogram"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:838227:656924:2">
    <scalar:urn rdf:resource="urn:scalar:path:838227:656924:2"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/cluster-analysis.35"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/k-means-clustering.11#index=2"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/k-means-clustering">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-11T22:00:21+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:172448"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/k-means-clustering.11"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/k-means-clustering.11"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/k-means-clustering.11">
    <ov:versionnumber>11</ov:versionnumber>
    <dcterms:title>K-Means Clustering</dcterms:title>
    <dcterms:description>The main overview page for K-means clustering</dcterms:description>
    <sioc:content>&lt;h2 id="k-means-clustering"&gt;K-Means Clustering&lt;/h2&gt;&lt;p&gt;K-Means clustering partitions a set of documents into a number of groups or clusters in a way that minimizes the variation within clusters. The &amp;quot;K&amp;quot; refers to the number of partitions, so for example, if you wish to see how your documents might cluster into three (3) groups, you would set K=3. [footnote? &amp;nbsp;You know like if you ask a mathematician, &amp;quot;Give me a number, any number&amp;quot; and she replies, &amp;quot;How about K.&amp;quot;]&lt;br /&gt;&lt;br /&gt;&lt;em&gt;[not sure about this paragraph: Thus it uses variation of the terms within documents, rather than distances between them, to form clusters. Unlike hierarchical clustering, K-Means clustering requires us to choose the number of clusters (K) we wish to produce. However, we do not need to choose a distance metric. As a result, K-Means can be a good alternative to hierarchical clustering for large data sets since it is less computationally intensive.]&lt;/em&gt;&lt;/p&gt;&lt;p&gt;When thinking of K-means clustering, we recommend that you think of each of your documents as represented by a single&amp;nbsp;(x,y) point on a two-dimensional&amp;nbsp;coordinate plane. In this view, a cluster is a collection&amp;nbsp;of documents (points) that are close to one another and together form a group. Assigning documents to a specific cluster amounts to determining which cluster &amp;quot;center&amp;quot; is closest to your document.&lt;br /&gt;&lt;br /&gt;[show an image of circles that contain points thereby forming clusters]&lt;br /&gt;&lt;br /&gt;The &lt;strong&gt;algorithm&lt;/strong&gt; (general procedure or &amp;quot;recipe&amp;quot;) for applying K-means to your collection of documents is described next. Again, the overall goal is to partition your documents into K non-empty subsets.&lt;/p&gt;&lt;ol&gt;&lt;li&gt;Decide on the number of clusters you wish to form. So yes,&amp;nbsp;&lt;em&gt;you&lt;/em&gt;&amp;nbsp;must pick a value for K&amp;nbsp;&lt;em&gt;a priori&lt;/em&gt;.&lt;/li&gt;&lt;li&gt;The algorithm will compute&amp;nbsp;a &amp;quot;center&amp;quot; or centroid&amp;nbsp;for each cluster. The centroid is the center (mean point) of a cluster. The procedure for creating centroids at the very start can be varied and is discussed below.&lt;/li&gt;&lt;li&gt;Assign each of your documents to the cluster with the nearest centroid.&lt;/li&gt;&lt;li&gt;Repeat steps 2 and 3, thereby&amp;nbsp;re-calculating the locations of centroids for the documents in each cluster and reassigning documents to the cluster with the closest center. The algorithm continues&amp;nbsp;until no documents are reassigned to different clusters.&lt;/li&gt;&lt;/ol&gt;&lt;p&gt;&lt;label for="max_iter" style="display: block; margin-bottom: 5px; font-size: 14px; line-height: 34px; clear: right; float: left; color: rgb(0, 0, 0); font-family: Lato, sans-serif;"&gt;&amp;nbsp;&lt;/label&gt;&lt;br /&gt;&lt;br /&gt;&lt;strong&gt;Required Settings:&lt;/strong&gt;&lt;br /&gt;&lt;br /&gt;&lt;span style="line-height: 16.64px;"&gt;&lt;strong&gt;K value&lt;/strong&gt;:&amp;nbsp;There is no obvious way to choose the number of clusters. It can be helpful to perform hierarchical clustering before performing K-Means clustering, as the resulting dendrogram may suggest a certain number of clusters that is likely to produce meaningful results. The K-means procedure is very sensitive to the position of the initial seeds, although employing the K-means++ setting can help to constrain this placement.&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Method of Visualization:&lt;/b&gt;&lt;br /&gt;As mentioned earlier, K-Means clustering is generally visualized on a two-dimensional plane with the distance between cluster members (documents) indicated by coordinates. Trapezoidal polygons known as Voronoi cells may be drawn around the cluster centroids to indicate which documents fall in which clusters. Another way of visualizing the results of K-Means clustering is with Principal Component Analysis (PCA), where dots on the plane are colored to mark their cluster membership. Both visualization approaches can you&amp;nbsp;judge distances between clusters.&lt;/p&gt;&lt;p&gt;&lt;br /&gt;&lt;strong&gt;Advanced Settings:&lt;/strong&gt;&lt;br /&gt;&lt;span style="line-height: 16.64px;"&gt;Since cluster membership is adjusted at each stage of the process by the re-location of the centroids, the number of iterations required and other factors can be adjusted to select a cutoff point for the algorithm or a desired threshold for convergence of different clusters. As with the initial choice of cluster numbers, there are no hard and fast rules for how these factors should be applied. &amp;nbsp;For most users, we (strongly?) recommend the default settings be used, that is, the user need not enter or change any of these settings and&amp;nbsp;&lt;em&gt;Lexos&lt;/em&gt;&amp;nbsp;will apply the default values.&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;strong&gt;Maximum number of iterations:&lt;/strong&gt;&lt;br /&gt;As noted above, the K-means algorithm will continue to re-compute centroids for each cluster until all documents settle down into &amp;quot;final, home&amp;quot; clusters. It is possible that a situation occurs where a document continues to toggle back and forth between two clusters. This value avoids an endless, or at least an unnecessary number of iterations with little change.&lt;br /&gt;&lt;br /&gt;&lt;strong&gt;Method of Initialization:&lt;/strong&gt;&lt;br /&gt;Your results of using K-means on a collection of documents can vary significantly depending on the&amp;nbsp;&lt;em&gt;initial&lt;/em&gt; choice of centroids. &amp;nbsp;In &lt;em&gt;Lexos&lt;/em&gt;&amp;nbsp;the user is offered two choices: K-Means++ and Random. When using K-Means++, the default setting in&amp;nbsp;&lt;em&gt;Lexos&lt;/em&gt;, the center of the first of the K clusters is chosen at random (typically by picking any one of the documents in the starting set as representative of a center of a future cluster). The remaining (K-1) cluster centers are then&amp;nbsp;chosen from the remaining documents by computing a probability proportional to the distances of the centers already chosen. O&lt;span style="line-height: 16.64px;"&gt;nce all centroids are chosen, normal K-Means clustering takes place. &lt;/span&gt;A &amp;quot;random seed&amp;quot; approach is used in which the locations of all&amp;nbsp;centroids at the initial&amp;nbsp;stage are generated randomly. It is best to experiment multiple times with different random seeds.&amp;nbsp;&lt;/p&gt;&lt;p&gt;&lt;br /&gt;&lt;strong&gt;&lt;span style="color: rgb(0, 0, 0); font-family: Lato, sans-serif; font-size: 14px; line-height: 34px;"&gt;Number of Iterations with Different Centroids: &amp;nbsp;&lt;/span&gt;&lt;/strong&gt;&lt;span style="color: rgb(0, 0, 0); font-family: Lato, sans-serif; font-size: 14px; line-height: 34px;"&gt;Given the sensitivity of final clusters on the choice of initial centroids,&amp;nbsp;&lt;em&gt;Lexos&lt;/em&gt;&amp;nbsp;uses a default setting of running &lt;strong&gt;?? (note: scikit learn says N=10; we have it set at 300?)&lt;/strong&gt;&amp;nbsp;trials, each trial using different centroid starting locations (or seeds).&lt;br /&gt;&lt;br /&gt;&lt;strong&gt;Relative Tolerance:&lt;/strong&gt;&lt;br /&gt;This setting allows an expert user to vary the rate of convergence of the algorithm. &lt;em&gt;(I&amp;#39;m not convinced this is a useful setting for our users?)&lt;/em&gt;&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;span style="line-height: 1.6;"&gt;The reliability of K-Means clustering can be evaluated by many statistical procedures. Lexos provides one criterion, the Silhouette Score, which is also used to evaluate the reliability of results follwoing&amp;nbsp;hierarchical clustering. See &lt;/span&gt;&lt;a data-display-content-preview-box="true" style="line-height: 1.6;" href="cluster-analysis"&gt;Cluster Analysis&lt;/a&gt;&lt;span style="line-height: 1.6;"&gt; for further discussion.&lt;/span&gt;&lt;/p&gt;&lt;br /&gt;&lt;br /&gt;&amp;nbsp;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2016-03-12T12:32:09+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:656924"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/k-means-clustering"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:838227:431807:3">
    <scalar:urn rdf:resource="urn:scalar:path:838227:431807:3"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/cluster-analysis.35"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/choosing-a-distance-metric.5#index=3"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:838227:838304:4">
    <scalar:urn rdf:resource="urn:scalar:path:838227:838304:4"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/cluster-analysis.35"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/establishing-robust-clusters.4#index=4"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/establishing-robust-clusters">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-12T00:21:10+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:172452"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/establishing-robust-clusters.4"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/establishing-robust-clusters.4"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/establishing-robust-clusters.4">
    <ov:versionnumber>4</ov:versionnumber>
    <dcterms:title>Establishing Robust Clusters</dcterms:title>
    <dcterms:description>Detailed discussion of how to handle cluster robustness</dcterms:description>
    <sioc:content>One of the most vexing questions in the use of cluster analysis for computational stylistics is how we distinguish &amp;quot;good&amp;quot; clusters from clusters that are mere &amp;quot;noise&amp;quot;, whether generated by our data or by our choice of implementations? Ideally, we want to generate &amp;quot;robust&amp;quot; clusters, by which we mean that they stand up to some measure of scrutiny. We can define this in many ways. If we cut several documents into segments and the individual segments of each document are clustered together in opposition to segments of other documents, we can assume that the clustering process has captured something meaningful, if only the distinctiveness of our original documents. When less predictable effects occur&amp;mdash;say one segment clusters with the &amp;quot;wrong&amp;quot; document&amp;mdash;we have to conclude either that there is something sub-optimal about our clustering procedure or that we have found something really interesting. Thus our intuitive sense of &amp;quot;surprise&amp;quot; at our results may be a measure of a weak clustering, but this &amp;quot;surprise&amp;quot; is also the goal of our analysis&amp;mdash;within reason. Below we discuss some methods of striking a balance between interpretations based on unexpected clusterings. We examine how we can be relatively sure that our clusters&amp;mdash;and thus our conclusions based on them&amp;mdash;are robust.&lt;br /&gt;&lt;br /&gt;The Holy Grail for some would be a statistical measure of with which to assess the &amp;quot;validity&amp;quot; of our clusters. A number of such measures exist, but their usefulness for a wide variety of data, and for the types of questions humanists typically ask of their data is an open question. Lexos offers one measure, the &lt;a href="silhouette-scores"&gt;Silhouette Score&lt;/a&gt;, which attempts to quantify our confidence that individual documents have been assigned to the &amp;quot;correct cluster&amp;quot;. However, we recommend that you integrate non-statistical approaches into your workflow. Creating a number of different cluster analyses with slightly different settings to see how well the clusters hold up to these &amp;quot;tweaks&amp;quot; is probably the most reliable way to establish confidence in your clusters. Drout et al. have outlined a variety of procedures in &lt;a target="_blank" href="http://www.palgrave.com/us/book/9783319306278"&gt;Beowulf Unlocked: New Evidence from Lexomic Analysis (2016)&lt;/a&gt;. [Extracts or summaries should be added here.]</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-19T17:45:48+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:838304"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/establishing-robust-clusters"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/cut">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-02T07:38:01+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:158924"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/cut.7"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/cut.7"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/cut.7">
    <ov:versionnumber>7</ov:versionnumber>
    <dcterms:title>The Cutter Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Cutter tool</dcterms:description>
    <sioc:content>&lt;p&gt;The Lexos &lt;strong&gt;Cutter&lt;/strong&gt; tool allows you to divide your texts into multiple segments. Each segment is treated by Lexos exactly like any other document. You can perform individual scrubbing actions, create word clouds of segments, and cluster the segments of documents just as you would any other text.&lt;/p&gt;&lt;h3&gt;Cutting Options&lt;/h3&gt;&lt;p&gt;Lexos gives you numerous options for designating where document should be cut into segments. The options are detailed below.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Characters/Segment&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;This option allows you to designate the number of characters you wish to be included in each segment. When the &lt;strong&gt;Characters/Segment&lt;/strong&gt; radio button is clicked, the &lt;strong&gt;Segment Size&lt;/strong&gt;, &lt;strong&gt;Overlap&lt;/strong&gt;, and &lt;strong&gt;Last Segment Size Threshold&lt;/strong&gt; options become visible. &lt;strong&gt;Segment Size&lt;/strong&gt; refers to the number of characters you wish to include in each segment. Lexos will begin a new segment when it reaches the number of characters you designate before starting over at the next segment. &lt;strong&gt;Overlap&lt;/strong&gt; allows you to specify an area of overlap between each segment. For instance, if you choose a segment size of 1000 characters and an overlap of 10 characters. Segment 1 will end at 1000 and Segment 2 will begin at 990. The &lt;strong&gt;Last Segment Size Threshold&lt;/strong&gt; option provides a method of handling circumstances where the final segment does not reach the number of characters in the designated segment size. The default setting is to treat this final segment as a separate segment if it is 50% or more of the length of the designated segment size. If not, the entire final segment will be attached to the previous one. Changing the &lt;strong&gt;Last Segment Size Threshold&lt;/strong&gt; percentage allows you to customize this behavior.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Lines/Segment&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;If your documents contain line breaks, you may use them to indicate where Lexos performs cutting actions. The &lt;strong&gt;Segment Size&lt;/strong&gt; option allows you to choose the number of lines after which Lexos will perform a cut. All the other options work exactly the same as for the &lt;strong&gt;Characters/Segment&lt;/strong&gt; option, except that they work by counting lines instead of characters.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Tokens/Segment&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;Lexos can perform cutting actions based on the number of tokens per segment. By default, it treats space-separated strings of characters as tokens, but this behavior can be modified by changing the settings in the &lt;strong&gt;Tokenizer&lt;/strong&gt; tool. This will allow you to use n-grams as your tokens. Apart from using tokens as the unit for measuring segment size, all other options work exactly the same as for the &lt;strong&gt;Characters/Segment&lt;/strong&gt; option.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Segments/Document&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;This option divides documents into a designated number of evenly-sized segments, regardless of the length of the document. Where the last segment is shorter than the others, Lexos applies a 50% &lt;strong&gt;Last Segment Size Threshold&lt;/strong&gt; percentage as described under &lt;strong&gt;Characters/Segment&lt;/strong&gt; above.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Cut by Milestone&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;This option allows you to assign a text string occurring in the document to use as a delimiter between segments. Typically, these &amp;ldquo;milestone&amp;rdquo; strings will be placed at appropriate locations in text files before they are uploaded to Lexos. For instance, you might add the string &amp;ldquo;CHAPTER&amp;rdquo; at the beginning of every chapter in a novel and then supply &amp;ldquo;CHAPTER&amp;rdquo; as the milestone term. Lexos will then perform a cut everytime it encounters this term, allowing you to divide your novel into individual documents for each chapter. Note that you must be careful to select a milestone term that does not occur anywhere as part of the text of your documents. Milestones are not counted as terms in the Document-Term Matrix (DTM).&lt;/p&gt;&lt;h3&gt;Cutting your Documents&lt;/h3&gt;&lt;p&gt;Once you have selected the cutting options you desire, click the &lt;strong&gt;Preview Cuts&lt;/strong&gt; button to see the results in the preview window. If you are happy with the cuts performed by Lexos, click the &lt;strong&gt;Apply Cuts&lt;/strong&gt; button. This will create new documents with the same name as the original followed by a number for each segment. Each segment will appear as a new document in the &lt;strong&gt;Manage&lt;/strong&gt; tool. Once cutting is applied, the original document is de-activated and the new segments are made active documents. In addition, once cuts are applied, each segment acquires an &lt;strong&gt;Individual Options&lt;/strong&gt; button in the preview window. Clicking this button opens a version of the cutting options form in the main Cutter tool which allows you to apply cuts to each segment individually.&lt;/p&gt;&lt;p&gt;You can download the new document segments by clicking the &lt;strong&gt;Download Cut Files&lt;/strong&gt; button.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>158924</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-13T18:35:26+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:831957"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/cut"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/cutting">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-19T01:37:40+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:174492"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/cutting.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/cutting.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/cutting.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>Cutting</dcterms:title>
    <dcterms:description>The main starting page for Cutting topics</dcterms:description>
    <sioc:content>Cutting topics go here.&lt;br /&gt;&lt;br /&gt;This path has not yet been developed.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T17:42:59+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839398"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/cutting"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/epistemology">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-02-27T13:19:08+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:389750"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/epistemology.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/epistemology.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/epistemology.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Epistemology</dcterms:title>
    <dcterms:description>The beginning of a thread on interpreting the results of computational text analysis</dcterms:description>
    <sioc:content>This is just the beginning of a thread on interpreting the results of computational text analysis. For now, we&amp;#39;re just posting relevant links.&lt;ul&gt;&lt;li&gt;&lt;a target="_blank" href="https://zentralwerkstatt.github.io/index.html?post=post_vsm_new"&gt;Fabian Offert, &amp;quot;Intuition and Epistemology of High-Dimensional Vector Space 1: Solving is Visualizing.&amp;quot; Zentralwerkstatt (February 22, 2017)&lt;/a&gt;.&lt;/li&gt;&lt;/ul&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-02-27T13:19:08+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1044728"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/epistemology"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/glossary">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-11T10:03:11+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:160761"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/glossary.15"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/glossary.15"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/glossary.15">
    <ov:versionnumber>15</ov:versionnumber>
    <dcterms:title>Glossary</dcterms:title>
    <dcterms:description>Glossary of terms used in Lexos and In the Margins</dcterms:description>
    <sioc:content>&lt;p&gt;This page is intended to provide definitions for the terms used within the Lexos suite, as well as to disambiguate terms drawn from natural language, programming languages, and linguistic analysis. New entries are being added on an ongoing basis.&lt;/p&gt;&lt;p&gt;&lt;a name="agglomerative-hierarchical-clustering"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Agglomerative Hierarchical Clustering&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="character"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Character&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;A character is any individual symbol. The letters that make up the Roman alphabet are characters, as are non-alphabetic symbols such as the Hanzi used in Chinese writing. In Lexos, the term &lt;em&gt;character&lt;/em&gt; generally refers to countable symbols.&lt;/p&gt;&lt;p&gt;&lt;a name="community-detection"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Community Detection&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="cosine-similarity"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Cosine Similarity&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="cutting"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Cutting&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="dendrogram"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Dendrogram&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="dimensionality-reduction"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Dimensionality Reduction&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="distance-metric"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Distance Metric&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="document"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Document&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;In Lexos, a document is any collection of words (known as terms in Lexos) or characters collected together to form a single item within the Lexos tool. A document is distinct from a file in that the term document refers specifically to the items manipulated within the Lexos software suite, as opposed to file, which refers to the items that are either uploaded from or downloaded to a user&amp;rsquo;s device.&lt;/p&gt;&lt;p&gt;&lt;a name="edit-distance"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Edit Distance&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="euclidean-distance"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Euclidean Distance&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="exclusive-cluster-analysis"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Exclusive Cluster Analysis&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="feature-selection"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Feature Selection&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="file"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;File&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;File refers to items that can be manipulated through the file manager on a user&amp;rsquo;s computer i.e. windows explorer, archive manager, etc. File is only used in the Lexos suite when referring to functions that involve the user&amp;rsquo;s file system, such as uploading or downloading.&lt;/p&gt;&lt;p&gt;&lt;a name="flat-cluster-analysis"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Flat Cluster Analysis&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="hapax-legomena"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;&lt;em&gt;Lapax Legomena&lt;/em&gt;&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;A term occurring only once in a document or corpus.&lt;/p&gt;&lt;p&gt;&lt;a name="hierarchical-cluster-analysis"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Hierarchical Cluster Analysis&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="k-means-clustering"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;K-Means Clustering&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="lemma"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Lemma&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;The dictionary headword form of a word. For instance, &amp;ldquo;cat&amp;rdquo; is the lemma for &amp;ldquo;cat&amp;rdquo;, &amp;ldquo;cats&amp;rdquo;, &amp;ldquo;cat&amp;rsquo;s&amp;rdquo;, and &amp;ldquo;cats&amp;rsquo;&amp;rdquo;. Lemmas are generally used to consolidate grammatical variations of the same word as a single term, but they may also be used for spelling variants.&lt;/p&gt;&lt;p&gt;&lt;a name="lexomics"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Lexomics&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;The term &amp;ldquo;lexomics&amp;rdquo; was originally used to describe the computer-assisted detection of &amp;ldquo;words&amp;rdquo; (short sequences of bases) in genomes,&lt;sup&gt;&lt;a href="http://www.jstor.org/stable/10.1086/668252#fn15"&gt;*&lt;/a&gt;&lt;/sup&gt; but we have extended it to apply to literature, where lexomics is the analysis of the frequency, distribution, and arrangement of words in large-scale patterns. Using statistical methods and computer-based tools to analyze data retrieved from electronic corpora, lexomic analysis allows us to identify patterns of vocabulary use that are too subtle or diffuse to be perceived easily. We then use the results derived from statistical and computer-based analysis to augment traditional literary approaches including close reading, philological analysis, and source study. Lexomics thus combines information processing and analysis with methods developed by medievalists over the past two centuries. We can use traditional methods to identify problems that can be addressed in new ways by lexomics, and we also use the results of lexomic analysis to help us zero in on textual relationships or portions of texts that might not previously have received much attention.&lt;/p&gt;&lt;p&gt;&lt;a name="n-gram"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;N-gram&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;An n-gram is a string of one or more tokens delimited by length. N-grams can be characters or larger tokens (e.g. space-bounded strings typically equivalent to words in Western languages). A one-character n-gram is described as a 1-gram or uni-gram. There are also 2-grams (bi-grams), 3-grams (tri-grams), 4-grams, and 5-grams. Larger n-grams are rarely used. Using n-grams to create a sliding window of characters in a text is one method of counting terms in non-Western languages (or DNA sequences) where spaces or other markers are not used to delimit token boundaries.&lt;/p&gt;&lt;p&gt;&lt;a name="normalization"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Normalization&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="overlapping-cluster-analysis"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Overlapping Cluster Analysis&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="partitioning-cluster-analysis"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Partitioning Cluster Analysis&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="rolling-window-analysis"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Rolling Window Analysis&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="scrubbing"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Scrubbing&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="segment"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Segment&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;After cutting a text in Lexos, the separated pieces of the text are referred to as segments. However, segments are treated by Lexos as documents and they may be referred to as documents when the focus is not on their being a part of the entire text.&lt;/p&gt;&lt;p&gt;&lt;a name="similarity"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Similarity&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="sparse-matrix"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Sparse Matrix&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="standard-deviation"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Standard Deviation&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="standard-error-test"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Standard Error Test&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="stopword"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Stopword&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="supervised-learning"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Supervised Learning&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="term"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Term&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;A term is the unique form of a token. If a &lt;strong&gt;token&lt;/strong&gt; &amp;ldquo;cat&amp;rdquo; occurs two times in a document, the &lt;strong&gt;term&lt;/strong&gt; count for &amp;ldquo;cat&amp;rdquo; is 2. In computational linguistics, terms are sometimes called &amp;ldquo;types&amp;rdquo;, but we avoid this usage for consistency.&lt;/p&gt;&lt;p&gt;&lt;a name="text"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Text&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;Text is a general term used to refer to the objects studied in lexomics, irrespective of the form. It thus may refer to either a file or documents, but it is typically used to refer to the whole work, rather than smaller segments.&lt;/p&gt;&lt;p&gt;&lt;a name="token"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Token&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;A token is an individual string of characters that may occur any number of times in a document. Tokens can be characters, words, or n-grams (strings of one or more characters or words).&lt;/p&gt;&lt;p&gt;&lt;a name="tokenization"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Tokenization&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;The process of dividing a text into &lt;em&gt;tokens&lt;/em&gt;.&lt;/p&gt;&lt;p&gt;&lt;a name="type"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Type&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;See &lt;strong&gt;term&lt;/strong&gt;.&lt;/p&gt;&lt;p&gt;&lt;a name="unicode"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Unicode&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="unsupervised-learning"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Unsupervised Learning&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;&lt;a name="word"&gt;&lt;/a&gt; &lt;strong&gt;&lt;u&gt;Word&lt;/u&gt;&lt;/strong&gt;&lt;/p&gt;&lt;p&gt;A word is, in many Western languages, a set of characters bounded by whitespace or punctuation marks, where whitespace refers to one or more spaces, tabs, or new-line inserts. However, to avoid ambiguity when dealing with many non-Western languages such as Chinese, where a single Hanzi character can refer to the equivalent of an entire Western word, &lt;em&gt;term&lt;/em&gt; is used throughout throughout the Lexos interface and documentation in place of &lt;em&gt;word&lt;/em&gt;. There are a few exceptions where &amp;ldquo;word&amp;rdquo; is used because it is part of an established phrase, it is less awkward, or because the context refers to the semantic category of words.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T17:44:56+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839401"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/glossary"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/handling-entities">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-01-24T21:00:47+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:377329"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/handling-entities.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/handling-entities.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/handling-entities.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Handling Entities</dcterms:title>
    <dcterms:description>Instruction for handling HTML, XML, and SGML Entities</dcterms:description>
    <sioc:content>Texts in HTML, XML, and SGML typically encode special characters with &lt;a name="cke-scalar-empty-anchor" target="_blank" href="https://en.wikipedia.org/wiki/Numeric_character_reference"&gt;numeric character references&lt;/a&gt;. In these markup languages, entities are typically represented using codes beginning with &lt;code&gt;&amp;amp; &lt;/code&gt; and ending with &lt;code&gt;; &lt;/code&gt;. These codes may be in decimal or hexadecimal format. For instance, the letter&amp;nbsp;&lt;em&gt;&amp;AElig;&lt;/em&gt; may be represented a&lt;code&gt; &amp;amp;#198;&lt;/code&gt; (decimal) or&lt;code&gt; &amp;amp;#xC6;&lt;/code&gt;(hexadecimal). Additionally, texts in these formats may use &lt;a href="https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references" target="_blank"&gt;character entity references&lt;/a&gt; such as &lt;code&gt;&amp;amp;AElig;&lt;/code&gt;, which can also be used to encode&amp;nbsp;&lt;em&gt;&amp;AElig;&lt;/em&gt; in HTML. Collectively, these refernces are often referred to as &amp;quot;entities&amp;quot;. Web browsers will automatically display the single-character equivalents of these entities if they are part of the HTML standard and/or are available in the display font.&lt;br /&gt;&lt;br /&gt;By default, the Lexos scrubbing tool leaves character entities alone, but this can lead to unexpected behaviors in combination with the &lt;strong&gt;Remove All Punctuation&lt;/strong&gt; option. When that option is applied, an entity like &lt;code&gt;&amp;amp;AElig;&lt;/code&gt; will become &lt;code&gt;AElig&lt;/code&gt; and may end up looking just like a word to Lexos&amp;#39; counting functions. HTML and XML texts are particularly likely to contain entities like &lt;code&gt;&amp;amp;amp; &lt;/code&gt;for &amp;quot;&amp;amp;&amp;quot; or &lt;code&gt;&amp;amp;quot;&lt;/code&gt; for curly quotation marks.&lt;br /&gt;&lt;br /&gt;If you wish to preserve these entities and still remove punctuation marks, you must convert them to their single character &lt;a href="https://en.wikipedia.org/wiki/Unicode" target="_blank"&gt;Unicode&lt;/a&gt; equivalents first. Lexos allows you to do this with the &lt;strong&gt;Special Characters&lt;/strong&gt; option. This replaces entities before punctuation marks are stripped, making it safe to remove punctuation.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-01-24T21:00:47+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1013550"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/handling-entities"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/hierarchical">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-19T17:41:21+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:318944"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical.3"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical.3"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/hierarchical.3">
    <ov:versionnumber>3</ov:versionnumber>
    <dcterms:title>The Hierarchical Clustering Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Hierarchical Clustering tool</dcterms:description>
    <sioc:content>&lt;p&gt;The Lexos &lt;strong&gt;Hierarchical Clustering&lt;/strong&gt; tool performs hierarchical agglomerative cluster analysis on your active documents and produces a visualization of this analysis in the form of a dendrogram (tree diagram). The most important options are the &lt;strong&gt;Distance Metric&lt;/strong&gt; (method of measuring the distance between documents) and &lt;strong&gt;Linkage Method&lt;/strong&gt; (method of determining when documents will be attached to a cluster) dropdown menus. Lexos uses Euclidean distance and average linkage as defaults. For further details about how to choose a distance metric and linkage method, see the topics discussion on &lt;a href="http://scalar.usc.edu/works/lexos/hierarchical-clustering" target="_blank"&gt;Hierarchical Clustering&lt;/a&gt;&lt;a href="hierarchical-clustering"&gt;&lt;/a&gt;.&lt;/p&gt;&lt;p&gt;The remaining options allow you to configure the appearance of the dendrogram. You may supply a &lt;strong&gt;Dendrogram Title&lt;/strong&gt;, which will be displayed at the top of the graph and select the &lt;strong&gt;Dendrogram Orientation&lt;/strong&gt; (vertical or horizontal). In our experience, vertically-oriented dendrograms are easier to interpret. However, when they have many leaves, the labels tend to overlap and become unreadable. Horizontal dendrograms may produce slightly better results. Another approach is to limit the &lt;strong&gt;Number of Leaves&lt;/strong&gt; displayed in the dendrogram. Reducing this number will collapse the most closely related clusters (those lower down on the dendrogram), showing only the larger groups. A numbered label in parentheses will show how many leaves have been collapsed into single branch. See below for other strategies for producing more readable dendrograms.&lt;/p&gt;&lt;p&gt;The &lt;strong&gt;Show Branch Height in Dendrogram&lt;/strong&gt; option will place red nodes at the top of each clade labelled with the height (length) of the clade branches from the leaf node. See the &lt;a resource="how-to-read-a-dendrogram" data-annotations="" data-caption="description" data-align="right" data-size="small" href="https://www.youtube.com/watch?v=MX6AUX1b1w0"&gt;How to Read a Dendrogram&lt;/a&gt; video for the interpretation of branch height. The &lt;strong&gt;Show Legends in Dendrogram&lt;/strong&gt; will add to the dendrogram image a series of annotations showing the options you have selected.&lt;/p&gt;&lt;p&gt;All of the &lt;a href="advanced-options"&gt;Advanced Options&lt;/a&gt; for manipulating the Document-Term Matrix (DTM) are available in the &lt;strong&gt;Hierarchical Clustering&lt;/strong&gt; tool. There are also options for generating a &lt;em&gt;Silhouette Score&lt;/em&gt;,&amp;nbsp; measure of determining cluster robustness. &lt;strong&gt;Silhouette Score Options&lt;/strong&gt; are discussed below.&lt;/p&gt;&lt;p&gt;&lt;strong&gt;Important&lt;/strong&gt;: Due to a limitation in the &lt;a target="_blank" href="http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html"&gt;scipy clustering package&lt;/a&gt; employed by Lexos to plot dendrograms, leaf labels containing non-Roman or other special characters will most likely appear as question marks. If this is the case, we recommend using the &lt;a href="advanced-options"&gt;Advanced Options&lt;/a&gt; &lt;strong&gt;Temporary Labels&lt;/strong&gt; function to ensure that your leaf labels clearly identify your documents. We hope to address this limitation in future versions of Lexos.&lt;/p&gt;&lt;p&gt;Once you have selected your options, click the &lt;strong&gt;Get Dendrogram&lt;/strong&gt; button. After the dendrogram appears, you can click on it top open it in a new window.&lt;/p&gt;&lt;h3&gt;Silhouette Scores&lt;/h3&gt;&lt;p&gt;Silhouette scores give a general indication of how well individual objects lie within their cluster and are thus one method of &lt;a href="establishing-robust-clusters"&gt;measuring cluster robustness&lt;/a&gt;. A score of 1 indicates tight, distinct clusters. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.&lt;/p&gt;&lt;p&gt;To generate a silhouette score for your dendrogram, click on the &lt;strong&gt;Silhouette Score Options&lt;/strong&gt; menu. You may set the &lt;strong&gt;Maximum Number of Clusters&lt;/strong&gt; to between 2 and the number of active documents in our session. After setting this number, click the green &lt;strong&gt;Get Dendrogram&lt;/strong&gt; button, and the silhouette score will appear above the button. Further information can be found in the topics article on &lt;a href="silhouette-scores"&gt;Silhouette Scores&lt;/a&gt;.&lt;/p&gt;&lt;h3&gt;Downloading Dendrograms&lt;/h3&gt;&lt;p&gt;Lexos allows you to download dendrogram images in a number of formats (PDF, PNG, and SVG). To download dendrogram image, click the appropriate button on the right side of the screen.&lt;/p&gt;&lt;p&gt;Lexos uses the &lt;a target="_blank" href="http://docs.scipy.org/doc/scipy/reference/genera/scipy.cluster.hierarchy.dendrogram.html"&gt;scipy clustering package&lt;/a&gt; to plot dendrograms, and this has some severe limitations in the type of output available. There are many other tools available which allow you to explore and manipulate dendrograms once you have done your cluster analysis. These tools typically allow you to import pre-existing dendrogram (tree) structure in &lt;a target="_blank" href="https://en.wikipedia.org/wiki/Newick_format"&gt;Newick format&lt;/a&gt;: a text file representing the hierarchical structure using parentheses and commas. Lexos also provides &lt;strong&gt;Newick&lt;/strong&gt; download button which will convert your dendrogram&amp;#39;s structure to a text file in Newick format. You can then upload this file in external tools. Note, however, that many external dendrogram plotting tools do not seem to preserve branch height.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-01-05T13:11:05+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:999384"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/how-to-read-a-dendrogram">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-06-16T08:55:01+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:161357"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/how-to-read-a-dendrogram.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/how-to-read-a-dendrogram.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/how-to-read-a-dendrogram.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>How to Read a Dendrogram</dcterms:title>
    <dcterms:description>YouTube video tutorial of how to read a dendrogram</dcterms:description>
    <art:url rdf:resource="https://www.youtube.com/watch?v=MX6AUX1b1w0"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-06-16T08:56:32+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:402159"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/how-to-read-a-dendrogram"/>
    <dcterms:isReferencedBy rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical-clustering"/>
    <dcterms:isReferencedBy rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/how-to-run-lexos">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-23T10:42:30+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:319255"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/how-to-run-lexos.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/how-to-run-lexos.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/how-to-run-lexos.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>How to Run Lexos</dcterms:title>
    <dcterms:description>Instructions for using Lexos online or on localhost</dcterms:description>
    <sioc:content>&lt;p&gt;Lexos is a web-based tool designed for transforming, analyzing, and visualizing texts. Lexos is designed for use primarily with small to medium-sized text collections, and especially for use with ancient languages and languages that do not employ the Latin alphabet. Lexos was created as an entry-level platform for Humanities scholars and students new to computational techniques while providing tools and techniques sophisticated enough for advanced research.&lt;/p&gt;&lt;p&gt;Lexos runs through your web browser and currently, Lexos supports Google Chrome and Mozilla Firefox; other browsers may not function properly. You my choose of the following methods of running Lexos:&lt;/p&gt;&lt;ol&gt;&lt;li&gt;Use the online installation hosted by the Lexomics project at &lt;a target="_blank" href="http://lexos.wheatoncollege.edu/"&gt;http://lexos.wheatoncollege.edu/&lt;/a&gt;. This is very convenient, but you may suffer uploading or processing delays based on fluctuations in internet speed.&lt;/li&gt;&lt;li&gt;&lt;a target="_blank" href="http://wheatoncollege.edu/lexomics/lexos-installers/"&gt;Download and Install Lexos&lt;/a&gt; using one of the methods provided on the Lexomics website (either use an auto-installer, follow manual instructions, or clone the GitHub repository). This method requires you to install the Python programming language on your computer. Lexos runs in a &amp;quot;localhost&amp;quot; web server on your machine, which may be faster than communicating with the Lexomics server. Running Lexos on your computer also provides the option to to use &amp;quot;local mode&amp;quot;, which does not require internet access (see below).&lt;/li&gt;&lt;/ol&gt;&lt;p&gt;Both methods have their advantages and disadvantages. If you are a beginner, we suggest that you get to know Lexos using the online version. Later, you can download Lexos and run it locally for greater speed.&lt;/p&gt;&lt;h3&gt;Using Local Mode&lt;/h3&gt;&lt;p&gt;Many functions in Lexos are based on common Javascript libraries like jQuery and Twitter Bootstrap, which are employed all over the internet. So, chances are that your browser has cached these libraries already and doesn&amp;#39;t need to load them, which makes loading times much faster. But we can&amp;#39;t rely on it. So, even if you are running Lexos on your own computer using localhost, Lexos still requires an active internet connection to download these Javascript libraries. Most of the time, this is not an issue.&lt;/p&gt;&lt;p&gt;But what if you don&amp;#39;t have an internet connection? You can still run Lexos locally on your computer. Lexos has all the Javascript libraries built in and will switch to them if you put it in &amp;quot;local mode&amp;quot;. All you have to do is find the Lexos folder on your computer and open the file &lt;code&gt;config.cfg&lt;/code&gt; in a text editor. Change &lt;code&gt;LOCAL_MODE = False&lt;/code&gt; to &lt;code&gt;LOCAL_MODE = True&lt;/code&gt; (be careful, it is case sensitive); then save the file. You can ignore the other settings. If you are already running Lexos, quit form it by typing &lt;code&gt;Control+C&lt;/code&gt; on the command line and then restart it by typing &lt;code&gt;python lexos.py&lt;/code&gt;. (See the &lt;a target="_blank" href="http://wheatoncollege.edu/lexomics/lexos-installers/"&gt;Manual Installation instructions&lt;/a&gt; on the Lexomics website if you need help with this.) You will now be running in local mode.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-23T15:57:03+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839689"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/how-to-run-lexos"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/index">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <scalar:banner>media/BeoEthThorn1000WordAve.JPG</scalar:banner>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-02T08:03:42+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:158930"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/index.62"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/index.62"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/index.62">
    <ov:versionnumber>62</ov:versionnumber>
    <dcterms:title>Welcome</dcterms:title>
    <dcterms:description>The In the Margins home page</dcterms:description>
    <sioc:content>&lt;p&gt;&lt;em&gt;In the Margins&lt;/em&gt; is a &lt;a href="http://scalar.usc.edu/"&gt;Scalar&lt;/a&gt; book which serves as a companion for Lexomic research and the &lt;a target="_blank" href="http://lexos.wheatoncollege.edu"&gt;Lexos&lt;/a&gt; literary text analysis software. The online version of the Lexos software is available at &lt;a href="http://lexos.wheatoncollege.edu/upload"&gt;http://lexos.wheatoncollege.edu&lt;/a&gt;. Our passions for tool-building have intersected with our interest in two questions:&lt;/p&gt;&lt;blockquote&gt;&lt;em&gt;How can we explore the growing impact that quantitative and algorithmic approaches are having on the Humanities?&lt;/em&gt;&lt;/blockquote&gt;&lt;blockquote&gt;&lt;em&gt;How can we make the discussion part of the tool and the tool part of the discussion​?&lt;/em&gt;&lt;/blockquote&gt;&lt;p&gt;&lt;em&gt;Lexomics&lt;/em&gt; is our name for certain methods of stylistic analysis (sometimes called stylometry). This type of analysis harnesses the power of modern computing and statistical techniques to investigate Humanities-based questions such as authorship attribution or textual lineage. Lexomic methods complement traditional Humanities methods of literary interpretation, rather than replacing these challenges. We note that our small but spirited team exists within a much larger community of scholars who continue to influence our team greatly (&lt;em&gt;cf.&lt;/em&gt; Eder, Craig, Jockers, Hoover, Liu, Sinclair and Rockwell, &lt;em&gt;et al.&lt;/em&gt;).&lt;/p&gt;&lt;p&gt;The role of Lexos is to help readers of literature identify and explore patterns in texts, thereby opening up new questions and new avenues of research. Lexos provides an integrated workflow of pre-processing, analytical, and visualization tools which allow students and scholars of literature to detect and explore patterns in their texts. &lt;a href="http://lexos.wheatoncollege.edu"&gt;Lexos&lt;/a&gt; is freely available for use online (perhaps the best choice for first and occassional users) and it may also be downloaded and installed locally for better performance (installation instructions are available &lt;a target="_blank" href="https://github.com/WheatonCS/Lexos/tree/master/0_InstallGuides"&gt;here&lt;/a&gt;).&lt;/p&gt;&lt;p&gt;The aim of Lexos is to create an entry-level environment for Lexomic scholarship, one simple enough to be used easily by the casual student but powerful enough for the advanced professor to use in creating new knowledge and insight. Lexos was created for use with small to medium-sized collections of texts (rather than large text corpora or &amp;quot;big data&amp;quot;), and for use with languages that have non-standard or non Latin-based spelling systems. Most of the early Lexomic research was done on medieval English texts. Doing statistical analysis on texts of these types creates certain challenges, both theoretical and practical, and Lexos developed as a way to explore them.&lt;/p&gt;&lt;p&gt;These issues form part of a wider set of questions we can ask about how computational tools can be used in the Humanities: where are the opportunities, what are the effective practices, and what are the limitations? These questions are not new with us of course, and the wider field is too large to cite here, but &lt;em&gt;In the Margins&lt;/em&gt; is our effort to bring the choice of and discussion about methodological decisions to the fore. Our companion documentation, &lt;em&gt;In the Margins,&lt;/em&gt; exists not only as a &amp;quot;how to&amp;quot; guide for using Lexos but also as a means to elicit community commentary of effective practices when making the many decisions during the workflow (e.g., how to handle punctuation, count words, and select metrics). &lt;em&gt;In the Margins&lt;/em&gt; can be explored directly from its &lt;a href="http://scalar.usc.edu/works/lexos/index"&gt;Scalar website&lt;/a&gt;, but we also make use of Scalar&amp;#39;s Application Programming Interface (API) to embed &lt;em&gt;In the Margins&lt;/em&gt; content directly in Lexos. We think it is important that Lexos not become a &amp;quot;black box&amp;quot; into which users feed their texts and from which they obtain results uncritically. By making the discussion part of the tool and the tool part of the discussion, we aim to make Lexos a more rigorous and powerful tool, one in which we can explore more generally the growing impact that quantitative and algorithmic approaches are having on the Humanities.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>book_splash</scalar:defaultView>
    <scalar:continue_to_content_id>173671</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-07-07T16:24:44+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1257313"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/index"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1257313:1257324:1">
    <scalar:urn rdf:resource="urn:scalar:path:1257313:1257324:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/index.62"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/learn-more.11#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/learn-more">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-16T16:06:26+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:173700"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/learn-more.11"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/learn-more.11"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/learn-more.11">
    <ov:versionnumber>11</ov:versionnumber>
    <dcterms:title>Learn More about In the Margins</dcterms:title>
    <sioc:content>&lt;p&gt;&lt;i&gt;In the Margins&lt;/i&gt; is the Lexomics Research Group&amp;rsquo;s attempt to position the process of computational&lt;i&gt; &lt;/i&gt;literary text analysis side by side with its product, whether it be the tool used for or the results obtained from such analysis. This is particularly important for entry-level users and those whose training has not explored the issues raised by computational methods of studying literature. Our text analysis tool, Lexos, is designed for use by newcomers to the field while empowering them to do sophisticated work in relatively little time. But with power comes a price--it must be employed critically. Too often text analysis tools elide aspects of the text analysis process, drawing attention away from the many steps and decisions required both before and after the use of the tool which can impact the results. Documentation tends to focus on how to use the software, rather than how or why it would be used in specific circumstances. Discussion of this sort may exist in other forums, but the separation between the discussion and the tool tends to make the latter function as a &amp;ldquo;black box&amp;rdquo;. This can ultimately feed tensions between theoretical traditions prevalent in the Humanities and the use quantitative methods that often have their origins in other disciplines. &lt;i&gt;In the Margins&lt;/i&gt; answers Johanna Drucker&amp;rsquo;s call for Digital Humanities to &amp;ldquo;synthesize method and theory into ways of doing as thinking&amp;rdquo; by designing tools that embody humanists&amp;rsquo; value of &amp;ldquo;debate, commentary, and interpretive exposition&amp;rdquo; (2012).&lt;/p&gt;&lt;p&gt;A central feature of our approach is the creation of a seamless transition between the tool, the documentation, and the discussion. &lt;i&gt;In the Margins&lt;/i&gt; contains both instructions for how to use Lexos and discussion about why particular steps or decisions might be taken in the analytical process. This content is then embedded within Lexos user interface so that the user is always aware of the need for reflection about the process. Although &lt;i&gt;In the Margins&lt;/i&gt; can be explored directly in the Scalar publishing platform, much of its content is also accessible from within Lexos, &lt;i&gt;in situ&lt;/i&gt;, so that the user is more easily able to find information about the implications and best practices for any given function and to reflect upon these issues as part of his or her process. &lt;i&gt;In the Margins&lt;/i&gt; embraces the design challenge of providing text, expert commentary, and screen-demos from within the Lexos workflow in order to offer commentary as close to the user&amp;rsquo;s current task as possible. This commentary comes from the Lexomics Research Group and an array of outside exports, and we hope that the content will grow over time. If you are interested in providing content for &lt;i&gt;In the Margins&lt;/i&gt;, please contact us.&lt;/p&gt;&lt;p&gt;The use of the Scalar publishing platform allows us to make &lt;i&gt;In the Margins&lt;/i&gt; content available both within Lexos (using Scalar&amp;rsquo;s API) and separately on the web for use as a resource by those who may be using other tools or approaches. Scalar organizes content into &amp;ldquo;paths&amp;rdquo;, which are like chapters of a book, except that individual pages can appear in multiple paths and paths can fork into other paths. Scalar provides methods of visualizing this structure to allow user to navigate the paths. In addition to the current path, &lt;i&gt;In the Margins&lt;/i&gt; provides a path about Lexomics and a path about Lexos. Most pages can be accessed from one of these paths, but a few, mostly those focusing specifically on providing instructions for using the Lexos tool, are accessible only from within Lexos.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-07-07T16:31:05+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1257324"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/learn-more"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/interface">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-15T14:58:12+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:314525"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/interface.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/interface.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/interface.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>The Lexos Interface</dcterms:title>
    <dcterms:description>Manual page for the Lexos Interface</dcterms:description>
    <sioc:content>The Lexos interface is designed to be simple to use, to emphasize the Lexomics workflow, and to make the many decisions required in performing computational text analysis as transparent as possible. As of version 3.0, it consists of 14 tools, all of which are accessible from the navigation menu at the top of the screen. The banner identifies which tool you are in through the use of curly braces, e.g. &amp;quot;Lexos{Scrubber}&amp;quot;. Each tool is part of a component of the workflow, and the current component is highlighted in light blue in the menu.&lt;br /&gt;&lt;br /&gt;When you start Lexos in your web browser, a session folder is created to contain all your files and settings. This is known as the Lexos &lt;strong&gt;workspace&lt;/strong&gt;. You may save your workspace at any time by clicking the &lt;strong&gt;Workspace&lt;/strong&gt; button at the top of the banner. Uploading this file in the &lt;strong&gt;Upload&lt;/strong&gt; tool will restore all your files and settings from their state when you downloaded the workspace.&lt;br /&gt;&lt;br /&gt;Note: If you are using the online version of Lexos at &lt;a target="_blank" href="http://lexos.wheatoncollege.edu/"&gt;http://lexos.wheatoncollege.edu/&lt;/a&gt;, your session folder may be stored on the server for up to a month. If you leave and return to Lexos, you may find that your last workspace pops up automatically. But we don&amp;#39;t recommend that you rely on this.&lt;br /&gt;&lt;br /&gt;The &lt;strong&gt;Reset&lt;/strong&gt; button will destroy your current session, start a new one, and redirect you to the &lt;strong&gt;Upload&lt;/strong&gt; tool. If you ever encounter an error, you may find that the functionality of Lexos can be restored by clicking the &lt;strong&gt;Reset&lt;/strong&gt; button or by replacing &lt;code&gt;/upload&lt;/code&gt;, &lt;code&gt;/manage&lt;/code&gt;, or whatever tool is at the end of the url in the browser with &lt;code&gt;/reset&lt;/code&gt;.&lt;br /&gt;&lt;br /&gt;The &lt;strong&gt;Gear&lt;/strong&gt; button in the top right corner of the interface opens a dialog with a message about Lexos. You can also click the &lt;strong&gt;Use Beta functions&lt;/strong&gt; checkbox to enable Lexos&amp;#39; Beta functions. These are new tools that are not yet fully tested. By default, they are hidden, but they will become visible if you select this option. Use Beta functions with caution, as they are not yet considered stable.&lt;br /&gt;&lt;br /&gt;Beneath the Lexos banner is the menu bar, which is organized to emphasize the &lt;a href="http://scalar.usc.edu/works/lexos/lexos"&gt;Lexomics workflow&lt;/a&gt;. On the right side of the banner Lexos displays a folder icon if you have active documents. Mousing over the icon will display a tooltip showing the number of active documents. Clicking on it will open the Lexos &lt;strong&gt;Manage&lt;/strong&gt; tool.&lt;h3&gt;The &lt;em&gt;In the Margins&lt;/em&gt; Panel&lt;/h3&gt;The &lt;em&gt;In the Margins&lt;/em&gt; Panel can be accessed from all tools in Lexos by clicking the the small tab on the left edge of the screen. Clicking the tab again will close the panel. The &lt;em&gt;In the Margins&lt;/em&gt; Panel contains the text of the Lexos Manual page for the tool currently in use. Click on the title link to open the page in a new window. This will give you access to the entire &lt;em&gt;In the Margins&lt;/em&gt; website.&lt;h3&gt;Feedback and Support&lt;/h3&gt;If you have questions or suggestions, click the &lt;strong&gt;Feedback and Support&lt;/strong&gt; link at the bottom of the screen. We also welcome bug reports on our &lt;a target="_blank" href="https://github.com/WheatonCS/Lexos/issues"&gt;GitHub site&lt;/a&gt;.&lt;h3&gt;Language and Terminology&lt;/h3&gt;Lexos has been designed using the insights of many different disciplines which often use different language for the same or similar concepts. In choosing terminology to label functions in the interface, we have attempted to walk a tightrope between familiar language, jargon, and language that might be inaccurate some users. Perhaps the most noticeable example is the use of &amp;quot;word&amp;quot;&amp;mdash;a very slippery concept indeed. Computational approaches to textual analysis can only work with countable units, and it is not always easy to identify what constitutes a &amp;quot;word&amp;quot;. In Western written languages, words are often designated by delimiters such as spaces and punctuation marks, but this does not apply to all languages. In order to be as neutral as possible, we adopt usage common in computational linguistics and machine learning. We refer to countable units as &amp;quot;tokens&amp;quot; and their unique forms as &amp;quot;terms&amp;quot;. This usage may at first feel unfamiliar to many humanities students and scholars, but we believe that it is preferable to avoid the problematic use of &amp;quot;word&amp;quot;. On the other hand, for some tools concepts &amp;quot;word clouds&amp;quot;, where &amp;quot;word&amp;quot; is well-established or otherwise useful, we have retained it. In this case, it should taken to be synonymous with &amp;quot;term&amp;quot;.&lt;br /&gt;&lt;br /&gt;Another usage we adopt from machine learning is the generic term &amp;quot;document&amp;quot; to refer to any type of text. In many disciplines, &amp;quot;documents&amp;quot; refers to particular types of &amp;quot;non-literary&amp;quot; text such as laws, treatises, invoices, and other types of records designed primarily without an aesthetic purpose in mind. Such a distinction is arguably an intellectual construct, but from a computational point of view there is no difference between a law and a lyric. Both consist of list, or vectors, of countable tokens. Furthermore, if you cut them into smaller segments, you are left--again, from a computational point of view&amp;mdash;with smaller vectors just like the originals. Hence it is appropriate to use the same term, &amp;quot;document&amp;quot; for both the whole text and segments of the text. In practice, this means that we adopt a variety of terms. On your computer, your texts are stored in &amp;quot;files&amp;quot;. When you upload them to Lexos, they become &amp;quot;documents&amp;quot; in the Lexos workspace. You may use Lexos to manipulates any documents in the workspace, whether they consist of the whole text or segments derived from them. We sometimes use &amp;quot;text&amp;quot; when we need a term that refers to the object of study and &amp;quot;segments&amp;quot; when we are referring specifically to slices of larger documents.&lt;br /&gt;&lt;br /&gt;If you ever get stuck with the terminology employed in Lexos, &lt;em&gt;In the Margins&lt;/em&gt; has a full &lt;a href="glossary"&gt;Glossary&lt;/a&gt;.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-24T09:49:41+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839906"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/interface"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/kmeans">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-15T17:42:48+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:314549"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/kmeans.4"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/kmeans.4"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/kmeans.4">
    <ov:versionnumber>4</ov:versionnumber>
    <dcterms:title>The K-Means Clustering Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos K-Means Clustering tool</dcterms:description>
    <sioc:content>&lt;p&gt;The Lexos &lt;strong&gt;K-Means Clustering&lt;/strong&gt; tool partitions your active documents into flat clusters in a way that minimizes the variation within the clusters. It produces a scatterplot graph in which you can visualize the distance between documents or clusters. The &amp;quot;K&amp;quot; in &amp;quot;K-Means&amp;quot; refers to the number of partitions. For instance, if you wish to cluster your documents into three groups, you would set &lt;code&gt;K=3&lt;/code&gt;. The default is the number of active documents, but you will probably want to set this to a smaller number. There is no obvious way to choose the number of clusters. It can be helpful to perform hierarchical clustering before performing K-Means clustering, as the resulting dendrogram may suggest a certain number of clusters that is likely to produce meaningful results. The K-means procedure is very sensitive to the position of the initial seeds, although employing the &lt;strong&gt;K-means++&lt;/strong&gt; setting can help to constrain this placement.&lt;/p&gt;&lt;p&gt;Lexos provides two methods of visualizing K-means cluster analyses. The default &lt;strong&gt;Voronoi Cells&lt;/strong&gt; identifies a centroid (central point) in each cluster and draws a trapezoidal polygon around it. This is helpful in allowing you to see which points fall into which cluster. Select &lt;strong&gt;PCA&lt;/strong&gt; in the &lt;strong&gt;Method of Visualization&lt;/strong&gt; dropdown to view the graph as a &lt;em&gt;&lt;a target="_blank" href="https://en.wikipedia.org/wiki/Principal_component_analysis"&gt;Principal Component Analysis&lt;/a&gt;&lt;/em&gt;, where dots on the plane are colored to mark their cluster membership. Both visualization approaches can you judge distances between clusters.&lt;/p&gt;&lt;h3&gt;Generating and Reading a K-Means Cluster Analysis&lt;/h3&gt;&lt;p&gt;Simply click the &lt;strong&gt;Get K-Means&lt;/strong&gt; button to perform a K-means cluster analysis. If you wish, you can modify the default settings using the &lt;strong&gt;Advanced K-Means Options&lt;/strong&gt; and &lt;strong&gt;Silhouette Score Options&lt;/strong&gt; menus described below.&lt;/p&gt;&lt;p&gt;K-Means cluster analyses can contain a lot of points that are very close together, making the graph difficult to read. In order to aid the process, Lexos provides a table to the left of the graph which displays your documents and color codes them to indicate which cluster they belong to. The same colors are used in the graph. In the Voronoi cell graph, you can move your mouse cursor over the document in the table or a point on the graph to reveal a tooltip label showing the document&amp;#39;s name.&lt;/p&gt;&lt;h4&gt;Advanced K-Means Options&lt;/h4&gt;&lt;p&gt;Since cluster membership is adjusted at each stage of the process by the re-location of the centroids, the number of iterations required and other factors can be adjusted to select a cutoff point for the algorithm or a desired threshold for convergence of different clusters. These adjustments are handled by the &lt;strong&gt;Advanced K-Means Options&lt;/strong&gt;: &lt;strong&gt;Maximum Number of Iterations&lt;/strong&gt;, &lt;strong&gt;Method of Initialization&lt;/strong&gt;, &lt;strong&gt;Number of Iterations with Different Centroids&lt;/strong&gt;, and &lt;strong&gt;Relative Tolerance&lt;/strong&gt;. As with the initial choice of cluster numbers, there are no hard and fast rules for how these factors should be applied. The default settings should serve most users&amp;#39; purposes. However, here are some brief descriptions of the purposes of each option:&lt;/p&gt;&lt;p&gt;&lt;u&gt;Maximum number of iterations:&lt;/u&gt; The K-means algorithm will continue to re-compute centroids for each cluster until all documents settle down into &amp;quot;final&amp;quot; clusters. It is possible that a situation occurs where a document continues to toggle back and forth between two clusters. Setting this value avoids an endless, or at least an unnecessary number of repetitions of the algorithm with little change.&lt;/p&gt;&lt;p&gt;&lt;u&gt;Method of Initialization:&lt;/u&gt; Your results of using K-means on a collection of documents can vary significantly depending on the initial choice of centroids. In Lexos the user is offered two choices: &lt;strong&gt;K-Means++&lt;/strong&gt; and &lt;strong&gt;Random&lt;/strong&gt;. When using the default &lt;strong&gt;K-Means++&lt;/strong&gt; setting, Lexos chooses the first of the K clusters at random (typically by picking any one of the documents in the starting set as representative of a center of a future cluster). The remaining (K-1) cluster centers are then chosen from the remaining documents by computing a probability proportional to the distances of the centers already chosen. Once all centroids are chosen, normal K-Means clustering takes place. The &lt;strong&gt;Random&lt;/strong&gt; setting employs a &amp;quot;random seed&amp;quot; approach in which the locations of &lt;em&gt;all&lt;/em&gt; centroids at the initial stage are generated randomly. It is best to experiment multiple times with different random seeds.&lt;/p&gt;&lt;p&gt;&lt;u&gt;Number of Iterations with Different Centroids:&lt;/u&gt; Documentation of this feature is not yet available.&lt;/p&gt;&lt;p&gt;&lt;u&gt;Relative Tolerance:&lt;/u&gt; Documentation of this feature is not yet available.&lt;/p&gt;&lt;h4&gt;Silhouette Score Options&lt;/h4&gt;&lt;p&gt;Silhouette scores give a general indication of how well individual objects lie within their cluster and are thus one method of &lt;a href="establishing-robust-clusters"&gt;measuring cluster robustness&lt;/a&gt;. A score of 1 indicates tight, distinct clusters. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar&lt;/p&gt;&lt;p&gt;To generate a silhouette score for your dendrogram, click on the &lt;strong&gt;Silhouette Score Options&lt;/strong&gt; menu. The only option is to change the &lt;strong&gt;Distance Metric&lt;/strong&gt; used for measuring the distance between points. For further information, see &lt;a href="choosing-a-distance-metric"&gt;Choosing a Distance Metric&lt;/a&gt;. Once you have selected a distance metric, click the &lt;strong&gt;Get K-Means&lt;/strong&gt; button and the silhouette score will appear below the button when the process is complete.&lt;/p&gt;&lt;h3&gt;Downloading K-Means Graphs&lt;/h3&gt;&lt;p&gt;There is currently no method for downloading Voronoi graphs, and we recommend taking screen shots. For PCA graphs, you can right-click and use your browser&amp;#39;s &lt;strong&gt;Save image as...&lt;/strong&gt; function. We recommend clicking the &lt;strong&gt;Enlarge Graph&lt;/strong&gt; button to open the image in a new window.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T16:38:02+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839366"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/kmeans"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/lemmas">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-02T08:30:19+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:158933"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/lemmas.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/lemmas.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/lemmas.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>Lemmas</dcterms:title>
    <sioc:content>The Lemmas option allows you to replace different words throughout the selection with a single new word. This is most often used to disambiguate varied spellings of a given word, such as in the case of kyng, cyng, and king. Using the Lemmas option, you could simply input a list in the form 'kyng, cyng: king' to replace every 'kyng' and 'cyng'&amp;nbsp; in the text with 'king'. Hopefully a bloodless coup.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-02T09:06:32+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:394032"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/lemmas"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/lexomics">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-16T00:11:22+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:173671"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/lexomics.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/lexomics.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/lexomics.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>Lexomics</dcterms:title>
    <dcterms:description>The starting point for the Lexomics path</dcterms:description>
    <sioc:content>&lt;span&gt;The term &amp;ldquo;lexomics&amp;rdquo; was originally used to describe the computer-assisted detection of &amp;ldquo;words&amp;rdquo; (short sequences of bases) in genomes,&lt;sup&gt;&lt;a href="http://www.jstor.org/stable/10.1086/668252#fn15"&gt;*&lt;/a&gt;&lt;/sup&gt; but we have extended it to apply to literature, where lexomics is the analysis of the frequency, distribution, and arrangement of words in large-scale patterns. Using statistical methods and computer-based tools to analyze data retrieved from electronic corpora, lexomic analysis allows us to identify patterns of vocabulary use that are too subtle or diffuse to be perceived easily. We then use the results derived from statistical and computer-based analysis to augment traditional literary approaches including close reading, philological analysis, and source study. Lexomics thus combines information processing and analysis with methods developed by medievalists over the past two centuries. We can use traditional methods to identify problems that can be addressed in new ways by lexomics, and we also use the results of lexomic analysis to help us zero in on textual relationships or portions of texts that might not previously have received much attention.&lt;br /&gt;&lt;br /&gt;More information can be found on the &lt;a target="_blank" href="http://lexomics.wheatoncollege.edu/"&gt;Lexomics&lt;/a&gt; website.&lt;/span&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>314553</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-16T20:22:36+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:834847"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/lexomics"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/lexos">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-16T00:13:08+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:173674"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/lexos.6"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/lexos.6"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/lexos.6">
    <ov:versionnumber>6</ov:versionnumber>
    <dcterms:title>The Lexos workflow</dcterms:title>
    <dcterms:description>The main starting page for the Lexos software path</dcterms:description>
    <sioc:content>So, you&amp;#39;ve got a group of texts and you want to explore them in new (computational) ways. But, where to start? What to do first? There are many decisions to make as you apply computational methods to your digital files.&lt;br /&gt;&lt;br /&gt;&lt;strong&gt;Upload --&amp;gt; Scrub --&amp;gt; Segment --&amp;gt; Count --&amp;gt; Cull --&amp;gt; Analyze --&amp;gt; Visualize &lt;/strong&gt;&lt;br /&gt;&lt;em&gt;(follow a path or jump around, repeat as needed)&lt;/em&gt;&lt;br /&gt;&lt;br /&gt;The &lt;em&gt;Lexos&lt;/em&gt; workflow provides a user experience that calls attention to the series of decisions you must make when working with digital texts. Together, a series of decisions in a workflow represents your experiment&amp;#39;s methodology, essentially the Methods section in a publication. In addition to providing entry points for discussions of the workflow (e.g., &lt;span style="line-height: 13.8666658401489px;"&gt;sharing effective practices when making choices),&lt;/span&gt; it has not escaped our notice that by explicitly addressing the many steps in the process strengthens the dissemination of results and&amp;nbsp;lends to&amp;nbsp;the repeatability of experiments.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2016-06-02T14:21:22+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:777328"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/lexos"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:777328:839675:1">
    <scalar:urn rdf:resource="urn:scalar:path:777328:839675:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/lexos.6"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/the-lexomics-workflow.30#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/the-lexomics-workflow">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-02T07:17:11+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:158915"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/the-lexomics-workflow.30"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/the-lexomics-workflow.30"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/the-lexomics-workflow.30">
    <ov:versionnumber>30</ov:versionnumber>
    <dcterms:title>The Lexomics Workflow</dcterms:title>
    <sioc:content>&lt;p&gt;We call Lexos &amp;quot;An Integrated Lexomics Workflow&amp;quot; because it brings together many of the processing steps we in the Lexomics project regularly perform in our research. Some history of the Lexomics project may give some useful perspective on what we mean by a workflow. When the Lexomics project began, it consisted of three simple PERL scripts: one to clean-up texts, one to cut them, and one to perform cluster analysis on them. Each script had to be run in sequence. So, after a while, it made sense to create a single tool that would guide the user from one to the next. It then became clear the tool&amp;#39;s interface could allow the user to go back to earlier steps, tweak the settings, and then repeat their experiments. There were in fact many ways in which a user could design experiments using a single tool, and the tool could help the user manage their activities and, perhaps more importantly, to think critically about their process. Thus was Lexos born.&lt;br /&gt;&lt;br /&gt;While the strictly linear steps of its origins are no longer the only possible approaches you can adopt when using Lexos, they provided an important insight about how computational text analysis workflows are constructed. They essentially have three basic steps: &lt;strong&gt;pre-processing (scrubbing)&lt;/strong&gt;, &lt;strong&gt;analysis&lt;/strong&gt;, and &lt;strong&gt;visualization&lt;/strong&gt;. It is not always possible to clearly separate these activities. Even in our earliest scripts, the first two were pre-processing steps and the last, which plotted a tree diagram of the cluster analysis, combined analysis and visualization. But, as Lexos has developed, we have tried to make this its organizing principle, encouraging the user to proceed from text preparation to simple visualization of their data to more complex analysis. This is particularly useful for entry-level users and those whose training has not explored the issues raised by computational methods. (&lt;em&gt;In the Margins&lt;/em&gt; is our attempt to position the process of computational text analysis side by side with its product.) Lexos is thus designed to enable newcomers to the field to adopt the Lexomics workflow, empowering them to do sophisticated work in relatively little time.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>158915</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-23T15:20:33+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839675"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/the-lexomics-workflow"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/manage">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-12T12:56:17+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:314084"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/manage.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/manage.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/manage.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>The Manage Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Manage tool</dcterms:description>
    <sioc:content>&lt;p&gt;&lt;b&gt;Manage&lt;/b&gt; is the tool you use to perform various types of &amp;quot;housekeeping&amp;quot; on documents in your Lexos workspace. In addition to documents derived from files you have uploaded, &lt;b&gt;Manage&lt;/b&gt; will also list documents created by other tools such as segments produced by the Cutter tool.&lt;/p&gt;&lt;p&gt;Use the &lt;b&gt;Manage&lt;/b&gt; tool for the following purposes&lt;/p&gt;&lt;ul&gt;&lt;li&gt;To activate and de-activate documents in your workspace. By default, most Lexos tools will only operate on your active documents.&lt;/li&gt;&lt;li&gt;To delete unwanted documents from your workspace.&lt;/li&gt;&lt;li&gt;To re-name or classify documents in your workspace.&lt;/li&gt;&lt;/ul&gt;&lt;h4&gt;The Manage Interface&lt;/h4&gt;&lt;p&gt;Documents in your workspace are listed in the form of a table. When the uploaded file from which each document is derived will be listed by filename in the &lt;b&gt;Original Source&lt;/b&gt; column. The &lt;b&gt;Document Name&lt;/b&gt; column lists the filename without the extension. If you use Lexos tools to create new documents based on your uploaded files, the original filename will be displayed in the &lt;b&gt;Original Source&lt;/b&gt; column, and a new name will be generated for the &lt;b&gt;Document Name&lt;/b&gt; column. Document names can be changed as described in &lt;b&gt;Using the Context Menu&lt;/b&gt; below.&lt;/p&gt;&lt;p&gt;Be default, documents created by file upload or a Lexos tool do not have an associated class, so the &lt;b&gt;Class Label&lt;/b&gt; column is empty. The &lt;b&gt;Excerpt&lt;/b&gt; column shows the beginning and end of each document separated by an ellipsis (...). Columns can be sorted alphabetically clicking on the column header. The table highlights columns in blue to show which column is being used to sort the listed documents. If you have a large table, you can filter it down to a few rows containing keywords entered in the &lt;b&gt;Search&lt;/b&gt; field. The text of the entire table is searched, so matches may be found in any column. You may use the &lt;b&gt;Display&lt;/b&gt; dropdown menu to increase the number of rows displayed, or you can use the pagination links at the bottom right of the table to paginate through smaller sets of rows.&lt;/p&gt;&lt;h4&gt;Activating, De-Activating, and Deleting Documents&lt;/h4&gt;&lt;p&gt;By default, all documents are activated when they are uploaded. Rows containing active documents are highlighted in green. The following methods can be used to manage the active state of documents:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;&lt;b&gt;Single Click&lt;/b&gt;: This will de-activate all documents and toggle the state of the row clicked. If it is active, it will be de-activated. If it is not active, it will be activated.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Control or Command Click&lt;/b&gt;: This will toggle the state of the row clicked without affecting the state of any other rows.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Shift Click&lt;/b&gt;: This activate ranges of rows. Shift-clicking on a row will activate documents in all rows between the row clicked and the first active row above or below the row clicked.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Drag Click&lt;/b&gt;: Clicking on a row with the mouse button held down will activate or de-activate all rows between the row clicked and the row the mouse cursor is over when the mouse button is released.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Right Click&lt;/b&gt;: This will open the context menu. See &lt;b&gt;Using the Context Menu&lt;/b&gt; below.&lt;/li&gt;&lt;li&gt;&lt;b&gt;The Select All and Deselect All Buttons&lt;/b&gt;: These are useful because they activate and de-activate all the documents in your workspace, not just those displayed on the page.&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;Documents may also be activated and de-activated using the &lt;b&gt;Context Menu&lt;/b&gt; as described below.&lt;/p&gt;&lt;p&gt;Certain tools such as &lt;b&gt;Word Cloud&lt;/b&gt; allow you to select and de-select sub-sets of your active documents. These selections apply only within the given tool and do not affect whether the documents are active or not throughout the Lexos suite. If you need to change the state of a document so that it is or is not accessible to all tools, you should do this using &lt;b&gt;Manage&lt;/b&gt;.&lt;/p&gt;&lt;h4&gt;Deleting Documents&lt;/h4&gt;&lt;p&gt;Deleting individual documents from the workspace is probably achieved most easily achieved using the &lt;b&gt;Context Menu&lt;/b&gt; as described below. However, you can deselect all documents, activate only the document you wish to delete, and then click the &lt;b&gt;Delete Selected&lt;/b&gt; button. This button is probably more useful when you have multiple active documents, as it will delete them all at once. Make sure that you have de-activated any documents you do not wish to delete.&lt;/p&gt;&lt;h4&gt;Using the Context Menu&lt;/h4&gt;&lt;p&gt;Right-clicking on a table cell or row will open the context menu. It has the following options:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;&lt;b&gt;Preview Document&lt;/b&gt;: This will open a dialog containing the entire text of your document (without formatting or white spaces). Note that longer documents can take a while to load, so please be patient.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Edit Document Name&lt;/b&gt;: This function allows you to create a new name for the document in the row you have clicked. To change the name, enter your new name in the dialog form field and click &lt;b&gt;Save&lt;/b&gt;.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Edit Document Class&lt;/b&gt;: This function allows you to create a class label for the document in the row you have clicked. Enter the label you wish to identify with the class in the dialog form field and click &lt;b&gt;Save&lt;/b&gt;. See further the section on document classes below.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Delete Document&lt;/b&gt;: This function will delete the individual document in the row you have clicked.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Select All Documents and Deselect All Documents&lt;/b&gt;: These options have the same function as the &lt;b&gt;Select All&lt;/b&gt; and &lt;b&gt;Deselect All&lt;/b&gt; buttons.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Apply Class to Selected Documents&lt;/b&gt;: If you have multiple active documents, this option will allow you to apply a class label to all of them at once. Enter the label you wish to identify with the class in the dialog form field and click &lt;b&gt;Save&lt;/b&gt;. See further the section on document classes below.&lt;/li&gt;&lt;li&gt;&lt;b&gt;Delete Selected Documents&lt;/b&gt;: If you have multiple active documents, this option will allow you to delete them all at once. It has the same function as the &lt;b&gt;Delete Selected&lt;/b&gt; button.&lt;/li&gt;&lt;/ul&gt;&lt;h4&gt;Classifying Documents&lt;/h4&gt;&lt;p&gt;Document classes are groups of documents identified as belonging to the same category defined by some human-assigned criterion. For instance, a collection of novels might be separated into two classes based on whether they were published in Britain or the United States. Gender, genre, and date of authorship might also be used to classify documents. Lexos&amp;#39; class labels allow you to assign classes to documents and sort by class in the &lt;b&gt;Manage&lt;/b&gt; tool. At present, document classes are under-utilized elsewhere in the Lexos suite, but they are an important part of the &lt;b&gt;Topwords&lt;/b&gt; tool. In general, you should assign class labels in &lt;b&gt;Manage&lt;/b&gt; before going to &lt;b&gt;Topwords&lt;/b&gt;.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-12T13:50:28+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:831436"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/manage"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/manual">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-15T18:02:04+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:314553"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/manual.10">
    <ov:versionnumber>10</ov:versionnumber>
    <dcterms:title>Manual</dcterms:title>
    <dcterms:description>Start page for the Lexos Manual</dcterms:description>
    <sioc:content>&lt;h3&gt;Introduction&lt;/h3&gt;&lt;p&gt;The Lexos Manual is the &amp;quot;how to&amp;quot; guide for the Lexos suite. Each tool is documented with instructions for how to use the various configurations in the interface. The manual attempts to present a straightforward account of how to use Lexos, but it also hints at the wider intellectual issues the user is presented with in using a tool like Lexos. In such cases, the Manual often links to more in-depth discussions in the &lt;strong&gt;Topics&lt;/strong&gt; section of &lt;em&gt;In the Margins&lt;/em&gt;.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>173669</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2018-05-23T20:50:16+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1817392"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/manual"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:839689:1">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:839689:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/how-to-run-lexos.2#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:839675:2">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:839675:2"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/the-lexomics-workflow.30#index=2"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:839906:3">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:839906:3"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/interface.2#index=3"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:834849:4">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:834849:4"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/upload-tool.3#index=4"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/upload-tool">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-12T12:31:31+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:314077"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/upload-tool.3"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/upload-tool.3"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/upload-tool.3">
    <ov:versionnumber>3</ov:versionnumber>
    <dcterms:title>The Upload Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Upload tool</dcterms:description>
    <sioc:content>&lt;p&gt;&lt;b&gt;Upload&lt;/b&gt; is the standard starting point for the Lexos workflow. When you begin a new session or reset your workspace, you will be automatically re-directed to &lt;b&gt;Upload&lt;/b&gt;.&lt;/p&gt;&lt;p&gt;Use of the tool is fairly straightforward. Drag your document files into the box labeled &lt;b&gt;drop files here&lt;/b&gt;, or click the &lt;b&gt;Browse&lt;/b&gt; button to use your web browser&amp;#39;s file browser to locate your files. Most browsers will allow you to shift- or control-click to select multiple files.&lt;/p&gt;&lt;p&gt;There are some restrictions on file upload size in order to prevent the browser from hanging. Nevertheless, upload times may be slow for large files, particularly if you are working over the internet. The maximum file size of 250MB is approximately the size of of nine Webster&amp;#39;s Unabridged Dictionaries. If you experience a problem, try uploading smaller files, or, if you are uploading many files, try uploading them in smaller batches.&lt;/p&gt;&lt;p&gt;Lexos accepts files in &lt;code&gt;.txt&lt;/code&gt;, &lt;code&gt;.html&lt;/code&gt;, &lt;code&gt;.xml&lt;/code&gt;, and &lt;code&gt;.sgml&lt;/code&gt;. Make sure that your filenames contain these extensions.&lt;/p&gt;&lt;p&gt;Once you have selected your files, they will begin to upload, one at a time. As each upload is complete, you will see a notification at the bottom of the screen shortly after the &lt;b&gt;Ready For Files To Upload&lt;/b&gt; progress bar has said &amp;quot;Complete!&amp;quot; The bigger the file the longer it will take to upload and show up on the page. After uploading is complete, each file is considered a document by Lexos. You can activate, de-activate, and re-label, and classify your documents using the Manage tool.&lt;/p&gt;&lt;p&gt;&lt;b&gt;Note on character encoding&lt;/b&gt;: Lexos will automatically convert all files to &lt;a target="_blank" href="https://en.wikipedia.org/wiki/UTF-8"&gt;UTF-8 character encoding&lt;/a&gt;. If you are uploading HTML, XML, or SGML files that contain special characters, the Scrubber tool will help you to convert them to UTF-8 characters.&lt;/p&gt;&lt;h4&gt;The Lexos Beta Web Scraper&lt;/h4&gt;&lt;p&gt;At present, your documents must be available as files on your computer. However, Lexos has a Beta web scraper tool, which will allow you to download files off the internet. This is especially useful when you are using files from sources such as &lt;a href="https://www.gutenberg.org/"&gt;Project Gutenberg&lt;/a&gt;. To enable the web scraper, click the &amp;quot;Gear&amp;quot; icon in the top right corner of the screen and select the &lt;b&gt;Use Beta functions&lt;/b&gt; checkbox. A link to the web scraper tool will appear above the &lt;b&gt;Browse&lt;/b&gt; button. Wherever possible, use it to download plain text files since, otherwise, you will download all the HTML markup in a web page (this can be removed using the Scrubber tool). Upload times may vary, depending on internet speeds. If the process seems to hang, try uploading fewer urls. Large-scale web scraping should not be done in Lexos.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-16T20:30:06+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:834849"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/upload-tool"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:831436:5">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:831436:5"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/manage.2#index=5"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:1052905:6">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:1052905:6"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/scrubber.46#index=6"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/scrubber">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-02T07:37:46+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:158923"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/scrubber.46"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/scrubber.46"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/scrubber.46">
    <ov:versionnumber>46</ov:versionnumber>
    <dcterms:title>The Scrubber Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Scrubber tool</dcterms:description>
    <sioc:content>&lt;p&gt;Preprocessing your texts, what we refer to as &amp;quot;scrubbing&amp;quot;, is a critical step in the Lexos workflow. In order to facilitate a conscious consideration of the many small decisions required, scrubbing options are isolated into individual choices. If for no other reason, your careful deliberation and choice of the many options facilitates a replication of your analyses in the future, both by you and others who wish to verify your experiment.&lt;/p&gt;&lt;p&gt;The Scrubber tool interface allows you to select and combine options on the left side of the screen. Click the &lt;strong&gt;Preview Scrubbing&lt;/strong&gt; button to see the results in the preview windows below. At this point, only the beginning and ending of each document is displayed, separated by an ellipsis (&amp;hellip;). When you are satisfied that you have achieved the desired effect, click the &lt;strong&gt;Apply Scrubbing&lt;/strong&gt; button. Your documents will be scrubbed, and the scrubbed versions will be used by all the other Lexos tools.&lt;/p&gt;&lt;p&gt;Scrubbing affects all active documents and cannot be undone. So make sure to de-activate any documents you do not wish to scrub using the &lt;strong&gt;Manage&lt;/strong&gt; tool. If you apply scrubbing and later wish to revert to the unscrubbed version, you will have to upload another copy to Lexos.&lt;/p&gt;&lt;p&gt;Scrubbing is an algorithm: a series of steps applied in a specific order. If you wish to change that order, you will need to de-select some options, scrub, re-select them, and then scrub again. The order of operations is provided in &lt;strong&gt;The Lexos Scrubber Algorithm&lt;/strong&gt; section below.&lt;/p&gt;&lt;h3&gt;Scrubbing Options&lt;/h3&gt;&lt;ol&gt;&lt;li&gt;&lt;strong&gt;Remove &lt;a href="https://www.gutenberg.org"&gt;Project Gutenberg&lt;/a&gt; boilerplate material&lt;/strong&gt;: Upon entering the Scrubber page, if you have uploaded a file from the Project Gutenberg website without removing the boilerplate material (i.e., text added by the Project Gutenberg site at the top and license material at the end of the text), you will receive the following warning:&lt;blockquote&gt;&lt;p&gt;One or more files you uploaded contain Project Gutenberg licensure material. You should remove the beginning and ending material, save, and re-upload the edited version. If you Apply Scrubbing with a text with Gutenberg boilerplate, Lexos will attempt to remove the majority of the Project Gutenberg Licensure, however there may still be some unwanted material left over.&lt;/p&gt;&lt;/blockquote&gt;&lt;p&gt;Note that if you select the &amp;lsquo;Apply Scrubbing&amp;rsquo; button without removing this extra text, Lexos will attempt to remove the Project Gutenberg boilerplate material at the top and end of the file. However, since Project Gutenberg texts do not have a consistent boilerplate format, we suggest you remove the boilerplate material using a text editor before uploading it to Lexos in order to prevent unwanted text from being included in subsequent analyses, e.g., including Project Gutenberg licensure material in your word counts. If you choose to let Lexos do the work for you, we recommend that you preview the beginning and ending of the document after you have scrubbed in the &lt;a href="manage"&gt;Manage&lt;/a&gt; tool in order to ensure that Lexos has not left any boilerplate or deleted any of your text. Lexos&amp;rsquo; attempt to remove start and ending boilerplate material only applies to files from the Project Gutenberg website. When choosing a file from this website, we recommend the &amp;ldquo;Plain Text UTF-8&amp;rdquo; version. It is smaller, so it will upload faster, and you will not have to remove any HTML markup.&lt;/p&gt;&lt;/li&gt;&lt;li&gt;&lt;p&gt;&lt;strong&gt;Remove All Punctuation&lt;/strong&gt;: Lexos assumes that uploaded files may be in any language and automatically converts them to &lt;a target="_blank" href="https://en.wikipedia.org/wiki/Unicode"&gt;Unicode&lt;/a&gt; using &lt;a target="_blank" href="https://en.wikipedia.org/wiki/UTF-8"&gt;UTF-8 character encoding&lt;/a&gt;. This requires that Lexos recognize punctuation marks from a wide variety of languages. All Unicode characters have an associated set of metadata for classifying its &amp;ldquo;type&amp;rdquo;, e.g. as a letter, punctuation, or symbol. If the &lt;strong&gt;Remove All Punctuation&lt;/strong&gt; option is selected, any Unicode character in each of the active texts with a &amp;ldquo;Punctuation Character Property&amp;rdquo; (that character&amp;rsquo;s property begins with a &amp;lsquo;P&amp;rsquo;) or a Symbol Character Property (begins with &amp;lsquo;S&amp;rsquo;) is removed. A guide to Unicode Character Categories can be found on &lt;a target="_blank" href="http://www.fileformat.info/info/unicode/category/index.htm"&gt;fileformat.info&lt;/a&gt;.&lt;/p&gt;&lt;p&gt;If &lt;strong&gt;Remove All Punctuation&lt;/strong&gt; is selected, three additional sub-options are available:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;&lt;strong&gt;Keep Hyphens&lt;/strong&gt;: Selecting this option will change all variations of Unicode hyphens to a single type of hyphen (&amp;quot;-&amp;quot;) and this will be left in the text. Hyphenated words (e.g., &amp;ldquo;computer-aided&amp;rdquo;) will be subsequently treated as a single token. Further discussion of the limitations can be found [here](link to scrubbing-topic/keep-hyphen).&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Keep Word-Internal Apostrophes&lt;/strong&gt;: If this option is selected, apostrophes will be retained in contractions (e.g., &lt;em&gt;can&amp;rsquo;t&lt;/em&gt;) and possessives (&lt;em&gt;Scott&amp;rsquo;s&lt;/em&gt;), but not those in plural possessives (&lt;em&gt;students&amp;rsquo;&lt;/em&gt; becomes the term&amp;nbsp;&lt;em&gt;students&lt;/em&gt;) nor those that appear at the start of a token (&lt;em&gt;&amp;#39;bout&lt;/em&gt; becomes the term&amp;nbsp;&lt;em&gt;bout&lt;/em&gt;). Further discussion of the limitations can be found [here](link to scrubbing-topic/keep-word-internal-apostrophes).&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Keep Ampersands&lt;/strong&gt;: This option will not treat ampersands as punctuation marks and will retain them in the text. Note that HTML, XML, and SGML entities such as &lt;code&gt;&amp;amp;aelig; &lt;/code&gt; (&lt;em&gt;&amp;aelig;&lt;/em&gt;) are handled separately and prior to the &lt;strong&gt;Keep Ampersands&lt;/strong&gt; option. You can choose how to convert these entities to standard Unicode characters using the &lt;strong&gt;Special Characters&lt;/strong&gt; option.&lt;/li&gt;&lt;/ul&gt;&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Make Lowercase&lt;/strong&gt;: Converts all uppercase characters to lowercase characters so that the tokens &lt;em&gt;The&lt;/em&gt; and &lt;em&gt;the&lt;/em&gt; will be considered as the same term. In addition, all contents (whether in uploaded files or entered manually) for the &lt;strong&gt;Stop Words/Keep Words&lt;/strong&gt;, &lt;strong&gt;Lemmas&lt;/strong&gt;, &lt;strong&gt;Consolidations&lt;/strong&gt;, or &lt;strong&gt;Special Characters&lt;/strong&gt; options will also have all uppercase characters changed to lowercase. Lowercase is not applied inside any HTML, XML, or SGML markup tags remaining in the text.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Remove Digits&lt;/strong&gt;: Removes all number characters from the text. Similar to the handling of punctuation marks, any Unicode character in each of the active texts with a &amp;ldquo;Number Character Property&amp;rdquo; is removed. For example, this option will remove a Chinese three (㈢) and Eastern Arabic six (۶) from the text. Note: at present, Lexos does not match Real numbers as a unit. For example, for &lt;em&gt;3.14&lt;/em&gt;, Lexos will remove (only) the 3, 1, and 4 and the decimal point will be removed only if the &lt;strong&gt;Remove All Punctuation&lt;/strong&gt; option is selected. &lt;strong&gt;Remove Digits&lt;/strong&gt; is not applied inside any HTML, XML, or SGML markup tags remaining in the text.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Remove Whitespace&lt;/strong&gt;: Removes all whitespace characters (blank spaces, tabs, and line breaks), except in HTML, XML, and SGML markup tags. Removing whitespace characters may be useful when you are working with non-Western languages such as Chinese that do not use whitespace for word boundaries. In addition, this option may be desired when tokenizing by character n-grams if you do not want spaces to be part of your n-grams. See the section on &lt;a href="link%20to%20tokenize%20page"&gt;Tokenization&lt;/a&gt; for further discussion on tokenizing by character n-grams. If &lt;strong&gt;Remove Whitespace&lt;/strong&gt; is selected the following sub-options are available to allow you to fine-tune the handling of whitespace:&lt;ul&gt;&lt;li&gt;&lt;strong&gt;Remove Spaces&lt;/strong&gt;: each &lt;em&gt;blank-space&lt;/em&gt; will be removed.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Remove Tabs&lt;/strong&gt;: each tab character ( &lt;code&gt;\t &lt;/code&gt;) will be removed.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Remove Line Break&lt;/strong&gt;: each newline character ( &lt;code&gt;\n &lt;/code&gt;) and carriage return character ( &lt;code&gt;\r &lt;/code&gt;) will be removed.&lt;/li&gt;&lt;/ul&gt;&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Scrub Tags&lt;/strong&gt;: Handles markup tags in angular brackets, such as those used in XML, HTML, and SGML. In markup languages like these, start and end tags like &lt;code&gt;&amp;lt;p&amp;gt;...&amp;lt;/p&amp;gt; &lt;/code&gt; are used to designate an &amp;ldquo;element&amp;rdquo;. Elements may be modified by &amp;ldquo;attributes&amp;rdquo; specified inside the start tag. For instance, a text using the the &lt;a target="_blank" href="http://www.tei-c.org/index.xml"&gt;Text Encoding Initiative (TEI)&lt;/a&gt; specification for XML might contain the markup &lt;code&gt;&amp;lt;p rend=&amp;quot;italic&amp;quot;&amp;gt;...&amp;lt;/p&amp;gt; &lt;/code&gt; for a paragraph in italics. When this option is selected, a gear icon will appear. Click the icon to open the tag scrubbing dialog. This will allow you to choose one of four options to handle each type of tag or to handle all the tags at once:&lt;ul&gt;&lt;li&gt;&lt;strong&gt;Remove Tag Only (default)&lt;/strong&gt;: Removes the start and end tags but keeps the content in between. For instance, &lt;code&gt;&amp;lt;p&amp;gt;Some text&amp;lt;/p&amp;gt; &lt;/code&gt; will be replaced by &lt;code&gt;Some text &lt;/code&gt;.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Remove Element and All Its Contents&lt;/strong&gt;: Removes the start and end tags and all the content in between. For instance, &lt;code&gt;&amp;lt;p&amp;gt;Some text&amp;lt;/p&amp;gt; &lt;/code&gt; will be removed entirely.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Replace Element&amp;rsquo;s Contents with Attribute Value&lt;/strong&gt;: Replaces the element with the value of one of its attributes. Since elements may have multiple attributes, Lexos allows you to enter the name of the attribute you wish to use. For instance, if you have some markup like &lt;code&gt;&amp;lt;stage type=&amp;quot;setting&amp;quot;&amp;gt; Scene &amp;lt;view&amp;gt;Morning-room in Algernon&amp;#39;s flat in Half-Moon Street.&amp;lt;/view&amp;gt;&amp;lt;/stage&amp;gt; &lt;/code&gt;, you could use this option to replace the entire scene description with &lt;code&gt;setting &lt;/code&gt; if you entered &lt;code&gt;type &lt;/code&gt; as the attribute name.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Leave Tag Alone&lt;/strong&gt;: This option will leave the specified element untouched in the text. This is especially useful if you want to scrub only certain markup tags.&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;&lt;strong&gt;Troubleshooting&lt;/strong&gt;: Lexos compiles a list of the tags in your documents by first attempting to parse the documents as XML. If the markup is not well-formed XML, it next tries to parse the documents as HTML using Python&amp;rsquo;s &lt;a target="_blank" href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/"&gt;BeautifulSoup&lt;/a&gt; library. This will generally work with the proviso that BeautifulSoup automatically converts all tags to lowercase. As a result, the Lexos scrubbing function will miss HTML (and SGML) tags that contain uppercase letters. In this case, you may have to check each of the tags Lexos finds to make sure it does not have uppercase letters in your original document. If you find that Lexos is not scrubbing tags containing capital letters, you will have to change these in an editor before uploading the files. This issue does not affect valid XML files since XML parsers are case sensitive. If Lexos is unable to compile an accurate list of the tags in your XML file, we recommend testing the file with an &lt;a target="_blank" href="http://www.w3schools.com/xml/xml_validator.asp"&gt;XML Validator&lt;/a&gt;.&lt;/p&gt;&lt;/li&gt;&lt;/ol&gt;&lt;h4&gt;Additional Options&lt;/h4&gt;&lt;ol&gt;&lt;li&gt;&lt;strong&gt;Stop Words/Keep Words&lt;/strong&gt;: &amp;ldquo;Stop Words&amp;rdquo; represents a list of words or terms to &lt;em&gt;remove&lt;/em&gt; from your documents, and &amp;ldquo;Keep Words&amp;rdquo; represents a list of words or terms that should remain in your documents with all other words removed. In both cases, words must be entered as comma-separated or line-separated lists like the following:&lt;pre&gt;&lt;code&gt;&lt;code&gt;a, some, that, the, which
a
some
that
the
which
&lt;/code&gt;
&lt;/code&gt;
&lt;/pre&gt; You may enter these lists manually in the provided form area or upload a file (e.g. &lt;code&gt;stopWords.txt &lt;/code&gt;). Note that the &amp;lt;b&amp;gt;Make Lowercase&amp;lt;/b&amp;gt; option will be applied to your list of stop/keep words if that option is also selected.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Lemmas&lt;/strong&gt;: Replaces all instances of terms in a list with a common replacement term called a &amp;ldquo;lemma&amp;rdquo;. Lemmas might be conceived of as dictionary headwords. Using the lemmas option will allow you to count a lemma and all of its variants (such grammatically inflected forms) as a single term. For instance, in Old English, the word for &amp;ldquo;king&amp;rdquo;, &lt;em&gt;cyning&lt;/em&gt; may occur as &lt;em&gt;cyninges&lt;/em&gt; (possessive) or &lt;em&gt;cyningas&lt;/em&gt; (plural), amongst other variants. If each of these forms occurs one time in a text, the &lt;strong&gt;Lemmas&lt;/strong&gt; function will instruct Lexos to treat this as three occurrences of the type &lt;em&gt;cyning&lt;/em&gt;. Lemmas are specified by providing a comma-separated list of variants followed by a colon and then the lemma. Multiple lemmas can be specified in separate lines as shown below:&lt;pre&gt;&lt;code&gt;cyninges, cyningas: cyning
Beowulfes, Beowulfe: Beowulf
&lt;/code&gt;
&lt;/pre&gt; The list may be entered manually in the form provided or uploaded from a file. Note that the &lt;strong&gt;Make Lowercase&lt;/strong&gt; option will be applied to your list of tokens and lemmas if that option is also selected. To replace individual characters with other characters, you should use the &lt;strong&gt;Consolidation&lt;/strong&gt; option.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Consolidations&lt;/strong&gt;: Replaces a list of characters with a different character. This is typically to consolidate symbols considered equivalent. For instance, in Old English the character common character &amp;ldquo;eth&amp;rdquo; &lt;em&gt;&amp;eth;&lt;/em&gt; is interchangeable with the character &amp;ldquo;thorn&amp;rdquo; &lt;em&gt;&amp;thorn;&lt;/em&gt;. The &lt;strong&gt;Consolidations&lt;/strong&gt; option allows you to choose to merge the two using a single character. Consolidations should be entered in the format &lt;code&gt;&amp;eth;: &amp;thorn; &lt;/code&gt;, where you wish to change all occurrences of &lt;code&gt;&amp;eth; &lt;/code&gt; to &lt;code&gt;&amp;thorn; &lt;/code&gt;. Multiple consolidations can be separated by commas or line breaks. Consolidations can be entered manually in&amp;nbsp;the provided form field or uploaded from a file. Note that the &lt;strong&gt;Make Lowercase&lt;/strong&gt; option will be applied to your list of characters if that option is also selected. To replace entire words (terms) with other words, you should use the &lt;strong&gt;Lemma&lt;/strong&gt; option.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Special Characters&lt;/strong&gt;: Replaces character entities with their glyph equivalents. A &lt;a target="_blank" href="https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references"&gt;character entity&lt;/a&gt; is a symbolic representation for an actual character symbol (glyph). Entities are used by markup languages like HTML, XML, and SGML when the symbol itself cannot be entered in the editor used to produce the text or when the method of rendering the character is left to independent software like a web browser. For instance, in HTML, the Old English character &amp;ldquo;aesc&amp;rdquo; (&lt;em&gt;&amp;aelig;&lt;/em&gt;) is represented with the entity &lt;code&gt;&amp;amp;aelig; &lt;/code&gt;. Since Lexos works entirely with Unicode characters you will most likely want to replace character entities with their Unicode equivalents prior to further analysis. The &lt;strong&gt;Special Characters&lt;/strong&gt; option can be used to replace entities like &lt;code&gt;&amp;amp;aelig; &lt;/code&gt; with its corresponding Unicode glyph &lt;em&gt;&amp;aelig;&lt;/em&gt;. Lexos provides four rule sets of pre-defined entities and their corresponding glyphs:&lt;ul&gt;&lt;li&gt;&lt;strong&gt;Early English HTML&lt;/strong&gt;: Transforms a variety of HTML entities used to encode Old English, Middle English, and Early Modern English into their corresponding glyphs.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Dictionary of Old English SGML&lt;/strong&gt;: Transforms SGML entities used by the &lt;em&gt;Dictionary of Old English&lt;/em&gt; into their corresponding glyphs.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;MUFI 3&lt;/strong&gt;: Transforms entities specified in version 3.0 of the Medieval Unicode Font Initiative (MUFI 3) to their corresponding glyphs.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;MUFI 4&lt;/strong&gt;: Transforms entities specified in version 4.0 of the Medieval Unicode Font Initiative (MUFI 4) to their corresponding glyphs.&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;Note: Selecting MUFI 3 or MUFI 4 will convert entities specified by the Medieval Unicode Font Initiative (MUFI) to their Unicode equivalents. In this case, the Preview window will be changed to use the &lt;a target="_blank" href="http://junicode.sourceforge.net/"&gt;Junicode&lt;/a&gt; font, which correctly displays most MUFI characters. However, if you downloaded your files after scrubbing, these characters may not display correctly on your computer if you do not have a MUFI-compatible font installed. Information about MUFI and other MUFI-compatible fonts can be found on the &lt;a href="http://folk.uib.no/hnooh/mufi/"&gt;MUFI website&lt;/a&gt;.&lt;/p&gt;&lt;p&gt;Note: Any special characters that appear inside tags &lt;em&gt;will&lt;/em&gt; be modified.&lt;/p&gt;&lt;p&gt;You may also design your own rule set if you are not using a language covered by one of the pre-defined rule sets. To do this, enter your transformation rules in the provided form field. The entity should be separated from its replacement glyph by a comma (e.g. &lt;code&gt;&amp;amp;aelig;, &amp;aelig; &lt;/code&gt;). Multiple transformation rules should be listed on separate lines. The Lexomics Project welcomes submission of the new rule sets. Please use the &lt;strong&gt;Feedback and Support&lt;/strong&gt; button in Lexos or &lt;a target="_blank" href="https://urldefense.proofpoint.com/v2/url?u=https-3A__docs.google.com_a_wheatoncollege.edu_forms_d_e_1FAIpQLSddEsRE2PcserYwcjtNpBAMF-2DYRKVrL4H4LtWDxHeNKoVVxcA_viewform&amp;amp;d=CwMCaQ&amp;amp;c=Oo8bPJf7k7r_cPTz1JF7vEiFxvFRfQtp-j14fFwh71U&amp;amp;r=fkkkcAta9tNbJT0GbA-b8fBT5Vx0day25Z1KcBOKxKQ&amp;amp;m=pvw58nUgCb4t3z5cj9Zj2XFIXgBppHEM8aQoOb5vqpA&amp;amp;s=ZxoSL8vQIBP526hKabavc3SaECtb_M8nMnjGHo6MiSk&amp;amp;e="&gt;click here&lt;/a&gt;&lt;a href="http://junicode.sourceforge.net/"&gt;&lt;/a&gt; to contact us about a adding pre-defined rule set to Lexos.&lt;/p&gt;&lt;/li&gt;&lt;/ol&gt;&lt;h3&gt;Replacing Patterns&lt;/h3&gt;&lt;p&gt;Sometimes it is necessary to replace a pattern rather than a precise string. For instance, if a document contains multiple URLs like &lt;code&gt;http://lexos.wheatoncollege.edu&lt;/code&gt; and &lt;code&gt;http://scalar.usc.edu/works/lexos/&lt;/code&gt;, and you need to strip these URLs, method is required for matching all URLs without knowing what they are in advance. This is known as regular expression (regex) pattern matching. Lexos uses regular expressions internally to perform its scrubbing options, but, as of version 3.0, it does not provide a way for users to supply their own regular expression patterns. If users need to strip or replace patterns by regular expression, it will be necessary to perform that action using a separate script or tool. A useful regular expressions tutorial can be found at &lt;a href="https://regexone.com/" target="_blank"&gt;RegexOne&lt;/a&gt;. Most modern text editors like &lt;a href="https://www.sublimetext.com/" target="_blank"&gt;Sublime Text&lt;/a&gt; and &lt;a href="http://www.barebones.com/products/TextWrangler/" target="_blank"&gt;TextWrangler&lt;/a&gt; accept regular expressions in their search and replace functions, and users may find them to be a convenient means of performing actions with regular expressions. We hope to add a regular expression pattern matching to Lexos in the near future. &amp;nbsp;&lt;/p&gt;&lt;h3&gt;The Lexos Scrubber Algorithm&lt;/h3&gt;&lt;p&gt;Lexos scrubs documents by applying rules in the following order:&lt;/p&gt;&lt;h4&gt;&lt;u&gt;When the &lt;strong&gt;Preview Scrubbing&lt;/strong&gt; button is clicked&lt;/u&gt;&lt;/h4&gt;&lt;p class="p1"&gt;&lt;span class="s1"&gt;Markup tags in angular brackets are not affected by the rules below except rule 4. The actual text is not permanently modified at this point, but of course the Preview window shows a sample of what will be changed if you select **Apply Scrubbing**.&lt;/span&gt;&lt;/p&gt;&lt;p class="p1"&gt;&lt;span class="s1"&gt;1.&amp;nbsp; Remove Project Gutenberg boilerplate, if present&lt;br /&gt;2.&amp;nbsp; Convert stopwords, keepwords, lemmas, consolidations, and special characters to lowercase (the actual text is converted to lowercase later, see step #5 below).&lt;br /&gt;3.&amp;nbsp; Apply special character transformations.&lt;br /&gt;4.&amp;nbsp; Apply markup tag scrubbing rules.&lt;br /&gt;5.&amp;nbsp; Convert text to lowercase.&lt;br /&gt;6.&amp;nbsp; Apply consolidation rules.&lt;br /&gt;7.&amp;nbsp; Apply lemmatization rules.&lt;br /&gt;8.&amp;nbsp; Apply stopword/keepword lists.&lt;br /&gt;9.&amp;nbsp; Remove punctuation (hyphens, apostrophes, ampersands).&lt;br /&gt;10.&amp;nbsp; Remove digits.&lt;br /&gt;11.&amp;nbsp; Remove whitespace.&lt;/span&gt;&lt;br /&gt;&amp;nbsp;&lt;/p&gt;&lt;h4&gt;&lt;u&gt;When the &lt;strong&gt;Apply Scrubbing&lt;/strong&gt; button is clicked&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;Markup tags in angular brackets are not affected by the rules below except rule 4.&lt;/p&gt;&lt;span style="line-height: 20.8px;"&gt;1.&amp;nbsp; Remove Project Gutenberg boilerplate, if present&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;2.&amp;nbsp; Convert stopwords, keepwords, lemmas, consolidations, and special characters to lowercase (the actual text is converted to lowercase later, see step #5 below).&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;3.&amp;nbsp; Apply special character transformations.&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;4.&amp;nbsp; Apply markup tag scrubbing rules.&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;5.&amp;nbsp; Convert text to lowercase.&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;6.&amp;nbsp; Apply consolidation rules.&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;7.&amp;nbsp; Apply lemmatization rules.&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;8.&amp;nbsp; Apply stopword/keepword lists.&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;9.&amp;nbsp; Remove punctuation (hyphens, apostrophes, ampersands).&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;10.&amp;nbsp; Remove digits.&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;&lt;span style="line-height: 20.8px;"&gt;​&lt;/span&gt;&lt;span style="line-height: 20.8px;"&gt;11.&amp;nbsp; Remove whitespace.&lt;/span&gt;&lt;br style="line-height: 20.8px;" /&gt;​</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>158923</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-03-05T09:29:49+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1052905"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/scrubber"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:831957:7">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:831957:7"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/cut.7#index=7"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:834853:8">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:834853:8"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/tokenize.13#index=8"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/tokenize">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-03T11:17:31+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:159092"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/tokenize.13"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/tokenize.13"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/tokenize.13">
    <ov:versionnumber>13</ov:versionnumber>
    <dcterms:title>The Tokenize/Count Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Tokenizer tool</dcterms:description>
    <sioc:content>&lt;p&gt;The &lt;strong&gt;Tokenizer/Count&lt;/strong&gt; tool, also known as &lt;strong&gt;Tokenizer&lt;/strong&gt;, is the backbone for many functions in Lexos. Tokenization is the process of dividing a string of text into countable units called &amp;ldquo;tokens&amp;rdquo;. Tokens are typically individual characters or words, but they can also be &amp;ldquo;n-grams&amp;rdquo;, units composed of one or more sequences of characters or words. By default, Lexos divides text into tokens using spaces as token delimiters. However, it can be set to treat every character as a token or to treat n-gram sequences as tokens.&lt;/p&gt;&lt;p&gt;Once the text is divided into tokens, Lexos assembles a &lt;strong&gt;Document-Term Matrix (DTM)&lt;/strong&gt;. This is a table of &amp;ldquo;terms&amp;rdquo; (also called &amp;ldquo;types&amp;rdquo;)&amp;mdash;unique token forms&amp;mdash;that occur in the active documents. Lexos calculates the number of times each document contains each term to produce the DTM. It displays the DTM as a table where you can explore important statistical information about your texts. Note that text corpora containing a large number of documents or types can take a while to process, so please be patient. If the table is too big, it may cause your browser to hang, and you may be forced to download the DTM to a spreadsheet program and work there. Lexos attempts to warn you when it is likely that you will need to download your data. Even if Lexos is able to display your DTM quickly, you may wish to download the data for use in other&lt;/p&gt;&lt;h3&gt;Using the DTM Table&lt;/h3&gt;&lt;p&gt;By default, Lexos displays the DTM with documents listed in columns and terms listed in rows. You may choose to transpose the table by selecting the &lt;strong&gt;Documents as Rows, Terms as Columns&lt;/strong&gt; option. However, it is most likely that you will have relatively few documents and a relatively large number of terms. Transposing the matrix will produce a table with potentially hundreds or thousands of columns, requiring you to scroll horizontally to view them. Lexos will warn you when this is likely and give you the option to download the transposed table to a spreadsheet program, where you may find it easier to work. You may also click the eye icon to toggle the visibility of individual columns. If you change the setting between &lt;strong&gt;Documents as Columns, Terms as Rows&lt;/strong&gt; and &lt;strong&gt;Documents as Rows, Terms as Columns&lt;/strong&gt;, click the &lt;strong&gt;Regenerate Table&lt;/strong&gt; button to apply the change of setting.&lt;/p&gt;&lt;p&gt;By default Lexos displays 10 table rows per page, but you can change this using the &lt;strong&gt;Display&lt;/strong&gt; dropdown menu. You can also filter the rows by entering keywords in the &lt;strong&gt;Search&lt;/strong&gt; form. To sort the table, click on a column header. A small icon next to the arrow in the header label will indicate both which column is being used for sorting and whether the sort direction is ascending or descending. Lexos calculates totals and averages for both rows and columns.&lt;/p&gt;&lt;p&gt;To download the DTM, click the &lt;strong&gt;Download CSV&lt;/strong&gt; or the &lt;strong&gt;Download TSV&lt;/strong&gt; button. &amp;ldquo;CSV&amp;rdquo; is short for comma-separated values, whereas &amp;ldquo;TSV&amp;rdquo; is short for tab-separated values. In your downloaded file, a comma or a tab will serve as the column delimiter. Spreadsheet programs can usually open both formats, but you may find one or the other easier to use for your purposes.&lt;/p&gt;&lt;h3&gt;Using the Advanced Options&lt;/h3&gt;&lt;p&gt;The configuration options in the top right inset section of the &lt;strong&gt;Tokenizer&lt;/strong&gt; tool allow you to change how the DTM is built. An important feature of these options is that they are saved to your session and will apply to all the other Lexos tools that make use of the DTM. For instance, if you restrict your DTM to only the 10 most frequent terms in your corpus, this slice of your DTM will also be used to generate word clouds, cluster analyses, and so on. The same configuration options occur in the other Lexos tools, so it is possible to change the settings there. In &lt;strong&gt;Tokenizer&lt;/strong&gt;, you should click the &lt;strong&gt;Regenerate Table&lt;/strong&gt; button each time you change the settings to re-build the DTM with the new configuration.&lt;/p&gt;&lt;p&gt;&lt;strong&gt;Tokenizer&lt;/strong&gt; provides several methods of manipulating the DTM in the panel at the top right of the screen. Instructions for using these methods can be found in &lt;a href="advanced-options"&gt;Advanced Options&lt;/a&gt;.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>159092</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-16T20:40:30+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:834853"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/tokenize"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:839446:9">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:839446:9"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/rolling-windows.2#index=9"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/rolling-windows">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-15T17:46:50+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:314550"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/rolling-windows.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/rolling-windows.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/rolling-windows.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>The Rolling Windows Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Rolling Windows tool</dcterms:description>
    <sioc:content>&lt;p&gt;&lt;strong&gt;Rolling window&lt;/strong&gt; analysis is a method of tracing the frequency of terms within a designated window of tokens over the course of a document. It can be used to identify small- and large-scale patterns of usage of individual features or to compare these patterns for multiple features. Rolling window analysis tabulates term frequency as part of a continuously moving metric, rather than in discrete segments. Beginning with the selection of a window, say 100 tokens, rolling window analysis traces the frequency of a term&amp;#39;s occurrence first within tokens 1-100, then 2 to 101, then 3, 102, and so on until the end of the document is reached. The result can be plotted as a line graph so that it is possible to observe gradual changes in a token&amp;rsquo;s frequency as the text progresses. Plotting different tokens on the same graph allows us to compare their frequencies.&lt;/p&gt;&lt;p&gt;The Lexos &lt;strong&gt;Rolling Windows Tool&lt;/strong&gt; performs this analysis. It has numerous options which are best understood as part of a workflow. In the Lexos interface, the steps of this workflow are numbered 1-6. Each of these options steps is discussed below.&lt;/p&gt;&lt;ol&gt;&lt;li&gt;&lt;strong&gt;Select Active Document:&lt;/strong&gt; Lexos performs rolling windows analysis on a single active document at a time. Use the radio buttons to select which document you would like to examine.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Select Calculation Type:&lt;/strong&gt; Lexos will plot either the average term frequency in each window (&lt;strong&gt;Rolling Average&lt;/strong&gt;) or the ratio of term frequencies if you are examining multiple terms (&lt;strong&gt;Rolling Ratio&lt;/strong&gt;).&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Enter Search Terms:&lt;/strong&gt; These are the terms you wish to plot from your document. Enter up to 6 terms, separated by commas. When Lexos searches your document for these terms, it uses the document text, rather than the Document-Term Matrix (DTM) as its starting point. This means that you can choose to search for strings of text, individual words or terms (separated by spaces), or regular expressions (regex). A basic tutorial for using regex can be found at &lt;a target="_blank" href="https://regexone.com/"&gt;https://regexone.com/&lt;/a&gt;.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Define Window:&lt;/strong&gt; This is where you set the size of the window you want to use. It can consist of any number of characters, tokens (separated by spaces), or lines (separated by line breaks in the text). If your document contains milestones, click the checkbox, and the location of each milestone will be indicated on the rolling window graph by a vertical line.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Choose Display Options:&lt;/strong&gt; The &lt;strong&gt;Hide Individual Points&lt;/strong&gt; option (turned on by default) produces an uninterrupted line graph, which may be easier to read. Turning this option off will show the points where each term occurs in the document. Mousing over the point will display the location of the term in the token sequence (starting from 0), along with the average or ratio at that point in the window. The &lt;strong&gt;Black and White Only&lt;/strong&gt; option produces a non-color version of the graph that is suitable for downloading and publishing in journals.&lt;/li&gt;&lt;li&gt;&lt;strong&gt;Get Graph:&lt;/strong&gt; Click the &lt;strong&gt;Get Graph&lt;/strong&gt; button to generate the Rolling Windows graph. Once it has been generated, the screen will scroll automatically to the top of the graph. Download buttons will also appear both above and below the graph. You can download the data by clicking the &lt;strong&gt;CSV Matrix&lt;/strong&gt; button. This will give you a comma-separated values (CSV) file, which you can open in a spreadsheet program. To download the image, click either of the SVG buttons as appropriate for your browser. A new tab will open, and you can save it by right-clicking and saving the page.&lt;/li&gt;&lt;/ol&gt;&lt;h3&gt;Additional Graph Interactivity&lt;/h3&gt;&lt;p&gt;In addition to mousing over points if you have turned off the &lt;strong&gt;Hide Individual Points&lt;/strong&gt;, you can drag your mouse over portions of the bottom ribbon to magnify sections of the graph.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T20:20:57+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839446"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/rolling-windows"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:832371:10">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:832371:10"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/word-cloud.6#index=10"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/word-cloud">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-04T10:07:32+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:159678"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/word-cloud.6"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/word-cloud.6"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/word-cloud.6">
    <ov:versionnumber>6</ov:versionnumber>
    <dcterms:title>The Word Cloud Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Word Cloud tool</dcterms:description>
    <sioc:content>&lt;p&gt;Word clouds are a method of visualizing the &lt;strong&gt;Document-Term Matrix&lt;/strong&gt;. They present terms arranged at angles for compactness, with each term sized according to its frequency within the text. Word clouds enable you to get a sense of the content in your corpus, and they are very good for presentations. However, they also have some well-known limitations (see the topics article on &lt;a href=""&gt;visualizing texts with word clouds&lt;/a&gt;). In some languages, individual tokens may not correspond to words, which will limit the usefulness of this method of visualization.&lt;/p&gt;&lt;p&gt;The Lexos &lt;strong&gt;Word Cloud&lt;/strong&gt; tool uses Jason Davies&amp;rsquo; excellent &lt;a target="_blank" href="https://www.jasondavies.com/wordcloud/"&gt;word cloud generator for d3.js&lt;/a&gt;&amp;mdash;with a few modifications&amp;mdash; to create beautiful, interactive word clouds. This implementation scales the size to ensure all terms fit within the layout. The color used for each does not convey meaning and is used only for aesthetic purposes.&lt;/p&gt;&lt;h3&gt;Generating Word Clouds&lt;/h3&gt;&lt;p&gt;Lexos allows you to choose some or all of your active documents from which to generate a word cloud. Once you have selected your documents using the checkboxes at the top right, click the &lt;strong&gt;Get Graph&lt;/strong&gt; button. After a few seconds, a word cloud will fade into view (be patient if you have selected large or many documents). Running your mouse cursor over each term in the word cloud will generate a tooltip showing the number of times it occurs in the documents you have selected. Click on the &lt;strong&gt;View Counts Table&lt;/strong&gt; button next to &lt;strong&gt;Get Graph&lt;/strong&gt; or below the word cloud to open a dialog containing a searchable, sortable table of the term counts in your word cloud.&lt;/p&gt;&amp;nbsp;&lt;p&gt;&lt;strong&gt;Warning&lt;/strong&gt;: The d3.js algorithm used by Lexos has an important limitation. It attempts to layout terms in as compact a manner as possible and is sometimes unable to find a fit for high frequency words. In these cases, these words are dropped from the word cloud. Because of this limitation, we highly recommend that you view the Counts Table in order to make sure that all the most frequent words are represented in the word cloud. If you find that this is not the case, try generating the word cloud again. Sometimes it will find a better layout in which the high frequency words fit. Using the layout options described below may also allow you to produce word clouds in which the missing words fit within the layout.&lt;/p&gt;&lt;h3&gt;Layout Options&lt;/h3&gt;&lt;p&gt;Davies&amp;rsquo; word cloud generator offers some useful ways to modify the layout using the controls below the graph. After modifying the settings, you can re-generate the word cloud by clicking anywhere on the graph. Each of the settings is described in detail below:&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Spiral&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;This refers to the method of calculating the angles and placement of terms in the layout. The &lt;strong&gt;Archimedean&lt;/strong&gt; setting uses the &lt;a href="https://en.wikipedia.org/wiki/Archimedean_spiral"&gt;&lt;/a&gt;&lt;a target="_blank" href="https://en.wikipedia.org/wiki/Archimedean_spiral"&gt;Archimedean spiral&lt;/a&gt; to determine the layout. The &lt;strong&gt;Rectangular&lt;/strong&gt; setting attempts to place terms within a rectangular shape.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Scale&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;This refers to how individual terms are sized relative to one another in the word cloud. Settings are &lt;code&gt;log n&lt;/code&gt; (logarithmic scale), &lt;code&gt;&amp;radic;n&lt;/code&gt; (square root scale), and &lt;code&gt;n&lt;/code&gt;, where &lt;code&gt;n&lt;/code&gt; refers to the number of times an individual term occurs. &lt;code&gt;log n&lt;/code&gt; and &lt;code&gt;&amp;radic;n&lt;/code&gt; are methods of transforming this number based on the possible minimum and maximum values. No single scaling is inherently superior to the others, but they will produce different effects in the layout. Using the &lt;code&gt;n&lt;/code&gt; scale setting will preserve the original proportionality of the values as far as possible. &lt;code&gt;log n&lt;/code&gt; may aid the differentiation of data that is not uniformly distributed. The square root transformation will inflate smaller numbers but stabilize the size of larger ones.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Font&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;You can change the appearance of your word cloud by setting the font here. This feature should work with any font installed on your system.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Orientation Settings&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;In the middle of the &lt;strong&gt;Layout Options&lt;/strong&gt; controls is a form to set the number of different orientations terms can have in the layout. You can also set the range of angles, either by setting the number of degrees in the form fields or by dragging the angles in the image below them.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Number of Words&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;By default, Lexos includes the top 250 terms in your documents. Use this setting to modify the number. Limiting the number of terms may help you to include high frequency terms which are dropped by the layout algorithm.&lt;/p&gt;&lt;h4&gt;&lt;u&gt;Download&lt;/u&gt;&lt;/h4&gt;&lt;p&gt;Word clouds are downloadable in either SVG or PNG format. SVG images are very useful because they scale well in web browsers. If you click the SVG button, a new window will open with a copy of your word cloud. Use your browser&amp;rsquo;s &lt;strong&gt;Save as&amp;hellip;&lt;/strong&gt; function to save the web page. If you click the PNG button, the image will open in a new window. The procedure for saving a PNG image is not standard in all browsers, so follow the instructions you see on the screen.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>159678</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-15T10:17:23+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:832371"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/word-cloud"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:834854:11">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:834854:11"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/multicloud.6#index=11"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/multicloud">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-04T10:08:03+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:159679"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/multicloud.6"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/multicloud.6"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/multicloud.6">
    <ov:versionnumber>6</ov:versionnumber>
    <dcterms:title>The Multicloud Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Multicloud tool</dcterms:description>
    <sioc:content>&lt;p&gt;Word clouds are a method of visualizing the &lt;strong&gt;Document-Term Matrix&lt;/strong&gt;. They present terms arranged at angles for compactness, with each term sized according to its frequency within the text. Word clouds enable you to get a sense of the content in your corpus, and they are very good for presentations. However, they also have some well-known limitations (see the topics article on &lt;a href=""&gt;visualizing texts with word clouds&lt;/a&gt;). In some languages, individual tokens may not correspond to words, which will limit the usefulness of this method of visualization.&lt;/p&gt;&lt;p&gt;The Lexos &lt;strong&gt;Multicloud&lt;/strong&gt; tool uses allows you to use word clouds to generate a comparative visualization of your documents. Each document has a separate word cloud, displayed in a grid that allows you to &amp;ldquo;eyeball&amp;rdquo; similarities and differences between the proportions of terms in each document. Because the word clouds are necessarily small, they may not always be easy to interpret. If you find interesting documents using the &lt;strong&gt;Multicloud&lt;/strong&gt; tool, it is recommended that you also generate larger clouds of each individual document using the &lt;strong&gt;Word Cloud&lt;/strong&gt; tool.&lt;/p&gt;&lt;p&gt;Word clouds are generated from lists (vectors) of term and count values, where the proportion of the overall counts determines the size of the terms in the layout. This method of visualizing term vectors is useful as a basic strategy for interpreting the results of &lt;strong&gt;topic models&lt;/strong&gt;, a common method of performing computational textual analysis not offered in Lexos. In the Humanities, the &lt;a href="http://mallet.cs.umass.edu/"&gt;&lt;/a&gt;&lt;a target="_blank" href="http://mallet.cs.umass.edu/"&gt;&lt;/a&gt;&lt;a href="http://mallet.cs.umass.edu/"&gt;MALLET&lt;/a&gt; topic modeling tool is commonly used, but it does not produce output that is easily visualized. The Lexos &lt;strong&gt;Multicloud&lt;/strong&gt; tool allows you to upload MALLET data and convert each topic into a Lexos document, from which you can generate word clouds or perform other operations such as cluster analysis. For more information on topic modeling and MALLET, see the separate article on &lt;a href=""&gt;Visualizing MALLET topic models&lt;/a&gt;.&lt;/p&gt;&lt;h3&gt;Generating and Using Multiclouds&lt;/h3&gt;&lt;p&gt;Before starting, choose whether you will be using your active documents (the default) or whether you will be uploading MALLET data to produce &amp;ldquo;topic clouds&amp;rdquo;. If you are using your active documents, select some or all of them using the &lt;strong&gt;Select Document(s)&lt;/strong&gt; check box. Then click the &lt;strong&gt;Get Graphs&lt;/strong&gt; button. Please keep in mind that generating multiclouds is computationally intensive and may take a very long time if you are generating a large number of clouds. Your browser may hang for some time. Please be patient. Once the graphs are generated, you can run your mouse cursor over each term in the word cloud to generate a tooltip showing the number of times the term occurs in the document.&lt;/p&gt;&lt;p&gt;If you wish to upload MALLET data, click the toggle switch showing &amp;ldquo;Document Clouds&amp;rdquo;, and it will change to &amp;ldquo;Topic Clouds&amp;rdquo;. Then click the &lt;strong&gt;Upload File&lt;/strong&gt; button to select your MALLET data file. Lexos will read either MALLET&amp;rsquo;s &lt;code&gt;words-topic-counts-file&lt;/code&gt; or its &lt;code&gt;output_state&lt;/code&gt; file. Wherever possible, it is best ot use the &lt;code&gt;words-topic-counts-file&lt;/code&gt; because it is smaller and easier to parse. Irregularities in the MALLET output may prevent Lexos from accurately parsing the data, and this will produce an error. See the separate article on &lt;a href=""&gt;Visualizing MALLET topic models&lt;/a&gt; for help in troubleshooting this issue. Once you have selected your MALLET data file, click &lt;strong&gt;Get Graphs&lt;/strong&gt;, and Lexos will generate word clouds of each topic in the topic model. You may also click the &lt;strong&gt;Convert topics to documents&lt;/strong&gt; checkbox. If you do this, the topic will appear as a document in the &lt;strong&gt;Manage&lt;/strong&gt; tool, and you will be able to use it in other Lexos tools.&lt;/p&gt;&lt;h3&gt;Rearranging Multiclouds&lt;/h3&gt;&lt;p&gt;If you generate more than 6 word clouds, you may find that you cannot view them all on the screen at once. In order to facilitate the comparatison of multiple word clouds, the &lt;strong&gt;Multicloud&lt;/strong&gt; tool allows you to drag and drop individual clouds into new positions in the grid. For instance, you can drag cloud 20 to the top of the grid in order to view it next to cloud 1.&lt;/p&gt;&lt;h3&gt;Exporting Multiclouds&lt;/h3&gt;&lt;p&gt;The &lt;strong&gt;Multicloud&lt;/strong&gt; tool does not yet have an export function, so the best way to save the &lt;strong&gt;Multicloud&lt;/strong&gt; display is with a screen shot. Some screen capture programs such as &lt;a target="_blank" href="http://getfireshot.com/"&gt;Fireshot&lt;/a&gt; allow you to capture the entire page, rather than just the visible area, which will allow you to save an image of all the multiclouds. In Mac OSX, you can click &lt;code&gt;Command-Option-F&lt;/code&gt; for full screen followed by &lt;code&gt;Command-Option-3&lt;/code&gt; or &lt;code&gt;Command-Option-4&lt;/code&gt;.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>159679</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-16T20:42:50+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:834854"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/multicloud"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:834855:12">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:834855:12"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/bubbleviz.4#index=12"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:835026:13">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:835026:13"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/statistics.7#index=13"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/statistics">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-15T17:25:58+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:314548"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/statistics.7"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/statistics.7"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/statistics.7">
    <ov:versionnumber>7</ov:versionnumber>
    <dcterms:title>The Statistics Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Statistics tool</dcterms:description>
    <sioc:content>&lt;p&gt;The Lexos &lt;strong&gt;Statistics&lt;/strong&gt; tool provides a basic overview of statistical content in your collection, in addition to the specific term counts/proportions&amp;nbsp;available in the Document-Term Matrix (DTM) provided in &lt;strong&gt;Tokenizer&lt;/strong&gt;.&lt;/p&gt;&lt;p&gt;&lt;strong&gt;Statistics&lt;/strong&gt; generates a table containing the number of distinct terms, the number of terms occurring once (&lt;em&gt;hapax legomena&lt;/em&gt;), the total term count, and the average term frequency in each document. You may generate statistics on all of your active files or you may select a subset of your active documents by using the &lt;strong&gt;Select Document(s)&lt;/strong&gt; checkboxes. All of the &lt;a href="advanced-options"&gt;Advanced Options&lt;/a&gt; for manipulating the Document-Term Matrix (DTM) are available. When you have chosen your settings, click the &lt;strong&gt;Generate Statistics&lt;/strong&gt; button.&lt;/p&gt;&lt;h3&gt;Using the Statistics Table&lt;/h3&gt;&lt;p&gt;The statistics table may be sorted by column by clicking on the column headers. An icon will indicate which column is being used for sorting and whether the sort direction is ascending or descending. &lt;span style="line-height: 20.8px;"&gt;(Note: the first click will sort that column in increasing order; click again to sort in decreasing order.)&amp;nbsp;&lt;/span&gt;Use the &lt;strong&gt;Display&lt;/strong&gt; dropdown menu to display more than the default 10 rows per page. The statistics table may be copied to your computer&amp;#39;s clipboard by clicking the &lt;strong&gt;Copy&lt;/strong&gt; button. It may also be downloaded as an Excel spreadsheet, Comma-Separated Values (CSV) file, Tab-Separated Values (TSV) file, or a PDF.&lt;/p&gt;&lt;h3&gt;Statistics for the Entire Corpus&lt;/h3&gt;When you generate the statistics table, Lexos also calculates the average, median, and interquartile range (&lt;a target="_blank" href="https://en.wikipedia.org/wiki/Interquartile_range"&gt;IQR&lt;/a&gt;) of your documents&amp;#39; sizes (based on term counts). This information is used to determine if any of the document sizes are anomalously large or small, that is, if any of your document sizes are outliers. Outliers are those document sizes that fall below Q1 - 1.5(IQR) or above Q3 + 1.5(IQR). Following a display of the average and median values, Lexos provides a warning for each document with a size that is particularly large or small compared to the rest of your corpus. You should consider removing outlier documents from subsequent analyses and/or consider additional cutting of some documents to make term counts more uniform.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2016-08-17T13:33:16+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:835026"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/statistics"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:839333:14">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:839333:14"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/hierarchical-clustering.17#index=14"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:839366:15">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:839366:15"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/kmeans.4#index=15"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:834931:16">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:834931:16"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/similarity-query.35#index=16"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/similarity-query">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-04T10:13:04+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:159684"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/similarity-query.35"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/similarity-query.35"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/similarity-query.35">
    <ov:versionnumber>35</ov:versionnumber>
    <dcterms:title>The Similarity Query Tool</dcterms:title>
    <dcterms:description>Manul page for the Lexos Similarity Query Tool</dcterms:description>
    <sioc:content>Similarity Query is a good choice for an early exploration when you wish to &lt;strong&gt;rank&lt;/strong&gt; the &amp;quot;closeness&amp;quot; between a single document and all other documents in your active set. As used here, the rankings are determined by &amp;quot;distance between documents&amp;quot;, where small distances (near zero) represent documents that are &amp;quot;similar&amp;quot; and unlike documents have distances closer to one. Similarity Query, as implemented here, is a variant of &lt;a target="_blank" href="https://en.wikipedia.org/wiki/Cosine_similarity"&gt;Cosine Similarity&lt;/a&gt;.&lt;h3&gt;Getting the Results of Similarity Query&lt;/h3&gt;&lt;ol&gt;&lt;li&gt;On the left, select the radio button for the one document to serve as the comparison document. All other active documents will be compared to this document.&lt;/li&gt;&lt;li&gt;In the panel on the right, you may configure the &lt;a href="advanced-options"&gt;Advanced Options&lt;/a&gt; for manipulating the Document-Term Matrix (DTM). Note: cosine similarity always uses proportions of tokens so no Normalization options are available here.&lt;/li&gt;&lt;li&gt;Select the green &lt;strong&gt;Get Similarity Rankings&lt;/strong&gt; button. The results will be shown below in a table, which may be sorted by column by clicking on the column headers.&amp;nbsp;An icon will indicate which column is being used for sorting and whether the sort direction is ascending or descending.&amp;nbsp;&lt;span style="line-height: 20.8px;"&gt;(Note: the first click will sort that column in increasing order; click again to sort in decreasing order.)&lt;/span&gt;&amp;nbsp;On the top-left of the table, use&amp;nbsp;the &lt;strong&gt;Display&lt;/strong&gt; dropdown menu to show more than the default 10 rows per page. Note that if you change your comparison document, you must click the &lt;strong&gt;Get Similarity Rankings&lt;/strong&gt; button again. If you have documents on multiple pages, you can quickly search for a document by typing the first few letters of the document name in the Search field in the top-right of the table.&lt;/li&gt;&lt;li&gt;The table can be downloaded as a comma-separated-value (CSV) file by clicking the blue &lt;strong&gt;Download Similarity CSV&lt;/strong&gt;. The file with all results will appear in your local Download directory/folder and may be opened in a spreadsheet program for further work.&lt;/li&gt;&lt;/ol&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>159684</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2016-08-17T09:30:45+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:834931"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/similarity-query"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1817392:839393:17">
    <scalar:urn rdf:resource="urn:scalar:path:1817392:839393:17"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/manual.10"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/topwords.12#index=17"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/topwords">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-04T10:13:48+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:159685"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/topwords.12"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/topwords.12"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/topwords.12">
    <ov:versionnumber>12</ov:versionnumber>
    <dcterms:title>The Topwords Tool</dcterms:title>
    <dcterms:description>Manual page for the Lexos Topwords tool</dcterms:description>
    <sioc:content>&lt;p&gt;The Lexos &lt;strong&gt;Topwords&lt;/strong&gt; tool allows you to ask what terms are more prominent in a certain document or class of documents than in other classes of documents or the collection as a whole. We call these highly prominent terms &amp;quot;topwords&amp;quot; (even when the term are not, strictly speaking, &amp;quot;words&amp;quot;.) The Topwords tool uses a &lt;a target="_blank" href="https://en.wikipedia.org/wiki/Z-test"&gt;Z-test&lt;/a&gt;, which determines which terms are outliers beyond the normal range of distribution in a document or a group of documents. The prominent terms you identify in &lt;strong&gt;Topwords&lt;/strong&gt; make good candidates for further analysis using the Lexos &lt;a href="http://scalar.usc.edu/works/lexos/manage"&gt;&lt;/a&gt;&lt;a href="rolling-windows"&gt;Rolling Windows&lt;/a&gt; tool.&lt;/p&gt;&lt;p&gt;&lt;strong&gt;Topwords&lt;/strong&gt; allows you to configure the criteria for determining the bounds of statistical prominence. Options for limiting the proportions to use in an analysis include commonly used metrics such as &lt;a target="_blank" href="https://en.wikipedia.org/wiki/Standard_deviation"&gt;Standard Deviation&lt;/a&gt; and &lt;a target="_blank" href="https://en.wikipedia.org/wiki/Interquartile_range"&gt;Interquartile Range (IQR)&lt;/a&gt;, as well the ability to use customizable bounds. &lt;strong&gt;Topwords&lt;/strong&gt; leverages the power of class labels to documents in the &lt;a href="manage"&gt;Manage&lt;/a&gt; tool. There, you can right-click and set the class on all active documents or set each document&amp;#39;s class individually. If you do not assign class labels to your active documents, &lt;strong&gt;Topwords&lt;/strong&gt; will only allow you to compare the proportions of each term in a single document to their proportions in the overall set of active documents in our collection.&lt;/p&gt;&lt;p&gt;Note that in&lt;strong&gt; Topwords&lt;/strong&gt;, documents and/or document classes should have at least 100 tokens each.&lt;/p&gt;&lt;h3&gt;Topwords Settings&lt;/h3&gt;&lt;p&gt;If you have not set any class labels in the &lt;strong&gt;Manage&lt;/strong&gt; tool, &lt;strong&gt;Topwords&lt;/strong&gt; will by default compare a single document to all the other active documents in your workspace. If your documents already contain class labels, you have two additional options: &lt;strong&gt;Compare each document against other class(es)&lt;/strong&gt; and &lt;strong&gt;Compare classes one to another&lt;/strong&gt;. The first allows you to compare each document to one or more classes. The second allows you to compare the classes themselves.&lt;/p&gt;&lt;p&gt;By default, Lexos will perform the analysis on all terms that appear in both groups (whether individual documents or groups of documents with class labels). However, you may also choose from two &lt;strong&gt;Built-in Options&lt;/strong&gt;. These are &lt;strong&gt;Standard Deviation&lt;/strong&gt; and &lt;strong&gt;Interquartile Range&lt;/strong&gt;. Standard Deviation calculates the mean frequency for all terms and compares the frequency of individual terms to this baseline. You may choose to limit to the analysis to the &lt;strong&gt;Top Outliers Only&lt;/strong&gt; (the 5% most prominent terms above the mean), the &lt;strong&gt;Bottom Outliers Only&lt;/strong&gt; (the 5% least prominent terms below the mean), or the &lt;strong&gt;Non-Outliers Only&lt;/strong&gt; (the terms that fall within one standard deviation above or below the mean). Options are similar for &lt;strong&gt;Interquartile Range&lt;/strong&gt;, except the distribution of terms is divided into first, second, and third &lt;em&gt;quartiles&lt;/em&gt; with the second quartile occupying the middle range. This essentially sets a slightly different set of cut-off boundaries from standard deviation. Lexos also allows you to set your own boundaries by selecting &lt;strong&gt;Customize Options&lt;/strong&gt;. You may base your boundaries on the &lt;strong&gt;Proportional Counts&lt;/strong&gt; (that is, the proportion of the documents assigned to each term) or the &lt;strong&gt;Raw Counts&lt;/strong&gt; of each term. Setting the &lt;strong&gt;Upper Boundary&lt;/strong&gt; and &lt;strong&gt;Lower Boundary&lt;/strong&gt; is equivalent to basing the analysis only the terms with proportions/counts in between these settings.&lt;/p&gt;&lt;h3&gt;How to Read the Results&lt;/h3&gt;&lt;p&gt;&lt;strong&gt;Topwords&lt;/strong&gt; produces a series of tables, each showing a different comparison (labeled at the top of the table). Within each table, the terms are ranked according to their Z-score. Only the top 20 statistically significant terms (those for which the Z-score has an absolute value larger than 1.96) are shown. A larger positive Z-score indicates a term in this document or class is used more frequently than in the comparison group. A larger negative Z-score indicates a term that is used relatively rarely. Note that Topwords assumes the distribution of term frequencies is a normal distribution, which is not the case for most data, so the results should be used with caution.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T17:40:12+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839393"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/topwords"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/1_macosxinstallguide">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-11T19:17:38+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:172426"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/1_macosxinstallguide.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/1_macosxinstallguide.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/1_macosxinstallguide.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>MacOS Install Directions</dcterms:title>
    <dcterms:description>Lexos MacOS Install Directions</dcterms:description>
    <art:url rdf:resource="http://scalar.usc.edu/works/lexos/media/1_MacOSXInstallGuide.pdf"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-11T19:17:38+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:430988"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/1_macosxinstallguide"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/beoeththorn1000wordave">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <art:thumbnail rdf:resource="http://scalar.usc.edu/works/lexos/media/BeoEthThorn1000WordAve_thumb.JPG"/>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-07-11T09:18:15+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:166048"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/beoeththorn1000wordave.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/beoeththorn1000wordave.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/beoeththorn1000wordave.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>Rolling Window graph of average &amp;eth; and &amp;thorn; frequencies in a window of 1000 words</dcterms:title>
    <dcterms:description>Rolling Window graph of average eth and thorn frequencies in a window of 1000 words</dcterms:description>
    <art:url rdf:resource="http://scalar.usc.edu/works/lexos/media/BeoEthThorn1000WordAve.JPG"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-07-11T09:20:10+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:415053"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/beoeththorn1000wordave"/>
    <dcterms:isReferencedBy rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-analysis"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/rolling-window-analysis">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-07-10T21:29:07+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:165905"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-analysis.11"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-analysis.11"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/rolling-window-analysis.11">
    <ov:versionnumber>11</ov:versionnumber>
    <dcterms:title>Rolling Window Analysis</dcterms:title>
    <sioc:content>Rolling window analysis is a method of tracing the frequency of features within a designated window of tokens over the course of a document. It can be used to identify small- and large-scale patterns of usage of individual features or to compare these patterns for multiple features.
&amp;nbsp;
A typical issue in text analysis, especially cluster analysis, is the question of how many documents should be used and how large they should be. Ultimately, one must adopt a system of trial and error to find a “sweet spot” which produces the most meaningful results. This challenge can arise even if whole texts are used, but it is felt most acutely when cutting texts into smaller segments. The result may yield meaningful patterns between segments, but it may also obscure smaller meaningful patterns both within and across them. For methods of manipulating segment size to detect such patterns, see &lt;a href="establishing-robust-clusters" data-display-content-preview-box="true"&gt;Identifying Robust Clusters&lt;/a&gt;.

Rolling window analysis is another method of identifying smaller patterns that can be used in conjunction with cluster analysis or on its own. Unlike cluster analysis, rolling window analysis does not require that we pre-specify boundaries—in fact, the technique allows us to identify possible boundaries or segments within texts. Rolling window analysis tabulates token frequency not in discrete segments but as part of a continuously moving metric. Beginning with the selection of a window, say 100 tokens, rolling window analysis traces the frequency of a token within tokens 1-100, then 2 to 101, then 3, 102, and so on until the end of the document is reached. The result can be plotted as a line graph so that it is possible to observe gradual changes in a token’s frequency as the text progresses. Plotting different tokens on the same graph allows us to compare their frequencies. For instance, the following graph compares the letters &lt;i&gt;þ&lt;/i&gt; and &lt;i&gt;ð&lt;/i&gt; in the Old English poem &lt;i&gt;Beowulf&lt;/i&gt;.

&lt;a class="inline" href="media/BeoEthThorn1000WordAve.JPG" resource="media/beoeththorn1000wordave"&gt;&lt;/a&gt;&amp;nbsp;
The greater preponderance of &lt;i&gt;ð&lt;/i&gt; in the second half of the poem is evidence of a change in scribe, but the many variations that occur in both halves may correlate to other phenomena such as different source material. Variations in the use of spellings with &lt;i&gt;þ&lt;/i&gt; and &lt;i&gt;ð&lt;/i&gt; in the Old English helped inspire the development of rolling windows analysis as a Lexomic method and a Lexos tool. To find out more with an extended case study, see &lt;a href="#"&gt;Rolling Windows Analysis of Old English Orthography&lt;/a&gt;.

Rolling windows analysis can be performed on any type of token, including n-grams of characters or words, or even whole lines of text. Different resolutions can be achieved by changing the size of the token window. In addition, different metrics for tabulating the data may be used. Lexos currently provides two:

&lt;b&gt;Rolling Window Average:&lt;/b&gt; The number of times a specific token appears in the window, divided by the overall size of the window.

&lt;b&gt;Rolling Window Ratio:&lt;/b&gt; The number of times a specific token appears divided by the sum of the appearances of the token and a second token. This metric can be used for comparing two mutually exclusive features.

By default, Lexos performs a forward-looking window analysis, in which the metric for the final section of tokens is not calculable. For example, if we are using a 100-token window size, we will not be able to compare the average frequency of a specific token in the final 99 tokens because the window size will of necessity be less than the window size used for the rest of the text. Hence a number of tokens equivalent to the window size minus 1 will be omitted from the analysis. Theoretically, it is possible to reverse this using a backward-looking window that begins at the end of the text and moves towards the beginning. In this case, we would lose the beginning tokens equivalent to the window size minus 1. We can also use a centered window by starting the window in the middle at a point equivalent to the window size divided by two. This method leaves undefined sections at both the beginning and the end of the text, but these segments are each only half the size of those generated by the forward- or backward-looking windows. Currently, Lexos does not enable the use of backward-looking or centered windows. To achieve these effects, it is necessary to cut the documents and re-order the resulting segments. In practice, forward-looking windows are typically sufficient unless one is particularly interested in the end of the text.

&lt;h3&gt;Selecting a Window Size&lt;/h3&gt;
The window size should be significantly smaller than the total number of tokens to be examined. You can quickly find out the total count for a token in the &lt;a href="#"&gt;Statistics&lt;/a&gt; tool. The window size limits the resolution of the analysis, allowing us to localize anomalies only to within the given window in which they appear. Smaller windows can improve resolution, but, because the &lt;a href="https://en.wikipedia.org/wiki/Standard_deviation"&gt;standard deviation&lt;/a&gt; of the data increases as the window size decreases, small windows have the potential to amplify the influence of random variations. Larger window sizes tend to smooth the data, eliminating random fluctuations but potentially obscuring smaller features.

We are not yet able to determine the optimal window size for a given text in the abstract, but a rule of thumb that has worked reasonably well when comparing experiments to control texts has been to use windows of between 100-500 words (approximately 20-100 lines in poetry), with a preference towards larger windows for longer texts.

The undefined (unexamined) sections at the end of the document in rolling window analyses should be taken into consideration when choosing window size. For instance, in a 3000 line poem, a rolling analysis using a 100-line window would cause 3% of the information at the end of the text to be excluded. In a 750-line poem, such a large window size (100-line window) would obscure 13% of the poem. It might also be true that interpolated or intertextual sections are less extensive in shorter texts, and to detect them we need to apply a smaller window.

&lt;b&gt;Not yet added to the path:
&lt;/b&gt;&lt;ol&gt;&lt;li&gt;Instructions for using the Rolling Window Graph tool in Lexos.&lt;/li&gt;&lt;/ol&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-12T17:02:20+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:431368"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-analysis"/>
    <dcterms:references rdf:resource="http://scalar.usc.edu/works/lexos/media/beoeththorn1000wordave"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/browse">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <art:thumbnail rdf:resource="http://scalar.usc.edu/works/lexos/media/browse_thumb.png"/>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-29T20:25:46+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:176740"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/browse.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/browse.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/browse.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Browse</dcterms:title>
    <dcterms:description>Browse button</dcterms:description>
    <art:url rdf:resource="http://scalar.usc.edu/works/lexos/media/browse.png"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-29T20:25:46+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:439069"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/browse"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/linux-install-directions">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-11T19:21:01+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:172428"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/linux-install-directions.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/linux-install-directions.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/linux-install-directions.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Linux Install Directions</dcterms:title>
    <dcterms:description>Lexos Linux Install Directions</dcterms:description>
    <art:url rdf:resource="http://scalar.usc.edu/works/lexos/media/1_LinuxInstallGuide.pdf"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-11T19:21:01+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:430995"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/linux-install-directions"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/macos-install-directions">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-11T19:26:48+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:172429"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/macos-install-directions.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/macos-install-directions.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/macos-install-directions.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>MacOS Install Directions</dcterms:title>
    <dcterms:description>Lexos MacOS  Install Directions</dcterms:description>
    <art:url rdf:resource="http://scalar.usc.edu/works/lexos/media/1_MacOSXInstallGuide.pdf"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-11T19:26:48+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:431002"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/macos-install-directions"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/reset">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <art:thumbnail>http://scalar.usc.edu/works/lexos/media/Screen Shot 2015-08-29 at 3.28.33 PM_thumb.png</art:thumbnail>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-29T19:32:54+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:176736"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/reset.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/reset.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/reset.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>Reset</dcterms:title>
    <art:url>http://scalar.usc.edu/works/lexos/media/Screen Shot 2015-08-29 at 3.28.33 PM.png</art:url>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-29T19:38:07+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:439041"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/reset"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/uploaded-files">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <art:thumbnail>http://scalar.usc.edu/works/lexos/media/Screen Shot 2015-08-19 at 1.57.54 PM_thumb.png</art:thumbnail>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-19T18:01:39+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:174542"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/uploaded-files.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/uploaded-files.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/uploaded-files.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Uploaded Files</dcterms:title>
    <dcterms:description>Filename and sizes are shown on the screen once successfully uploaded by Lexos</dcterms:description>
    <art:url>http://scalar.usc.edu/works/lexos/media/Screen Shot 2015-08-19 at 1.57.54 PM.png</art:url>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-19T18:01:39+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:434945"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/uploaded-files"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/windows-install-directions">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-11T19:19:29+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:172427"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/windows-install-directions.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/windows-install-directions.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/windows-install-directions.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Windows Install Directions</dcterms:title>
    <dcterms:description>Lexos Windows Install Directions</dcterms:description>
    <art:url rdf:resource="http://scalar.usc.edu/works/lexos/media/1_WindowsInstallGuide.pdf"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-11T19:19:29+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:430991"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/windows-install-directions"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/workspace">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <art:thumbnail>http://scalar.usc.edu/works/lexos/media/Screen Shot 2015-08-29 at 3.28.22 PM_thumb.png</art:thumbnail>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-29T19:33:15+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:176737"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/media/workspace.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/media/workspace.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/media/workspace.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Workspace</dcterms:title>
    <art:url>http://scalar.usc.edu/works/lexos/media/Screen Shot 2015-08-29 at 3.28.22 PM.png</art:url>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-29T19:33:16+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:439037"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/media/workspace"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:834854:585353:1">
    <scalar:urn rdf:resource="urn:scalar:path:834854:585353:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/multicloud.6"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/multicloud-tips-and-tricks.1#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/multicloud-tips-and-tricks">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-01-14T11:39:55+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:224367"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/multicloud-tips-and-tricks.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/multicloud-tips-and-tricks.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/multicloud-tips-and-tricks.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Multicloud Tips and Tricks</dcterms:title>
    <dcterms:description>Some ways to get the most out of Multicloud</dcterms:description>
    <sioc:content>Typically, only two or three rows of word clouds produced by Multicloud are visible on the screen. To see more clouds, you must scroll down, hiding the ones at the top. If you wish to compare clouds that are far apart, you can drag and drop the clouds you are interested in to get them to appear closer together. Although useful, this drag and drop method is not always ideal, particularly if you wish to use the word clouds in an external presentation. This article provides some tips for how to get manipulate the Lexos Multicloud output.&lt;br /&gt;&lt;br /&gt;Currently, Lexos does not have a function to export the word clouds produced by Multicloud, and screen shots will only capture the visible clouds. Luckily, most browsers have extensions or plugins that will allow you to capture a complete web page as an image. A good example is for Firefox is &lt;a target="_blank" href="http://getfireshot.com/"&gt;Fireshot&lt;/a&gt;, but others are available for Firefox and other browsers. Once you have captured the full web page as an image, you can use an image editor to crop the image so that it contains just the word clouds.&lt;br /&gt;&lt;br /&gt;The next step is to divide the image into smaller images. For presentations, it might be desirable to have each row of three word clouds as a separate image. You could take individual screen shots of each row, but this article will outline another method using the &lt;a target="_blank" href="https://www.gimp.org/"&gt;GIMP&lt;/a&gt; image editor. GIMP has a tool called &lt;a target="_blank" href="http://docs.gimp.org/en/gimp-concepts-image-guides.html"&gt;Guides&lt;/a&gt; which allows you to designated boundaries for slicing up the image. If you have an image with ten rows of word clouds, you can insert nine guides between them. (It is also possible to insert vertical guides to divide individual clouds in each row.) You can then use GIMP&amp;#39;s &lt;a target="_blank" href="http://docs.gimp.org/en/python-fu-slice.html"&gt;Slice&lt;/a&gt; function to slice up the image into ten smaller images, one for each row. As a bonus, the Slice tool produces an html file with a table containing each of the smaller images. This is a good starting point for using them in a web presentation. However, the smaller images can also be inserted into tools like word processors or slideshow presentations. If you don&amp;#39;t use GIMP, your chosen image editor may have similar tools.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-01-14T11:39:55+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:585353"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/multicloud-tips-and-tricks"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/other-topic-paths">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-19T15:24:23+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:174506"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/other-topic-paths.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/other-topic-paths.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/other-topic-paths.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>Other Topic Paths</dcterms:title>
    <dcterms:description>Running list of suggested topic paths.</dcterms:description>
    <sioc:content>The structure of the Lexos workflow does not perfectly coincide with the numerous possible topics that we can explore. This page keeps a running list of possible topics to be added to the Topic path:&lt;ol&gt;&lt;li&gt;Statistics for stylistic analysis&lt;/li&gt;&lt;li&gt;Authorial attribution&lt;/li&gt;&lt;/ol&gt;What are topics should we include? Add to the list.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-25T14:09:23+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:840506"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/other-topic-paths"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/pre-processing">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-19T01:33:25+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:174490"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/pre-processing.5"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/pre-processing.5"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/pre-processing.5">
    <ov:versionnumber>5</ov:versionnumber>
    <dcterms:title>Pre-Processing</dcterms:title>
    <dcterms:description>The main pre-processing path</dcterms:description>
    <sioc:content>Pre-processing is our path for topics related to the &amp;quot;Scrubbing&amp;quot; process. This path has not yet been developed.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T17:41:24+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839395"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/pre-processing"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:839395:839396:1">
    <scalar:urn rdf:resource="urn:scalar:path:839395:839396:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/pre-processing.5"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/scrubbing.7#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/scrubbing">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-19T01:36:37+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:174491"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/scrubbing.7"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/scrubbing.7"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/scrubbing.7">
    <ov:versionnumber>7</ov:versionnumber>
    <dcterms:title>Scrubbing</dcterms:title>
    <dcterms:description>The main path for Scrubbing topics</dcterms:description>
    <sioc:content>Preparing your texts for subsequent tokenization and analysis is a critical but essential step, what we refer to as &amp;quot;scrubbing&amp;quot; your texts. In order to facilitate a conscious consideration of the many small decisions required, scrubbing options are isolated into individual choices. If for no other reason, your careful deliberation and choice of the many options facilitates a replication of your analyses in the future, both by you and others who wish to verify your experiment.&lt;br /&gt;&lt;br /&gt;This path has not yet been developed.&lt;br /&gt;&lt;br style="line-height: 20.8px;" /&gt;&amp;nbsp;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>174491</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T17:41:58+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839396"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/scrubbing"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:839395:839398:2">
    <scalar:urn rdf:resource="urn:scalar:path:839395:839398:2"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/pre-processing.5"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/cutting.2#index=2"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:839395:839399:3">
    <scalar:urn rdf:resource="urn:scalar:path:839395:839399:3"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/pre-processing.5"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/tokenisation.2#index=3"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/tokenisation">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-19T01:38:22+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:174493"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/tokenisation.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/tokenisation.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/tokenisation.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>Tokenization</dcterms:title>
    <dcterms:description>The main path for tokenization topics</dcterms:description>
    <sioc:content>Tokenization topics go here.&lt;br /&gt;&lt;br /&gt;This path has not yet been developed.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T17:43:25+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839399"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/tokenisation"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:431368:415896:1">
    <scalar:urn rdf:resource="urn:scalar:path:431368:415896:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-analysis.11"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-formal-description.7#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/rolling-window-formal-description">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <scalar:customStyle>code{white-space: pre;}</scalar:customStyle>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-07-12T13:16:15+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:166466"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-formal-description.7"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-formal-description.7"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/rolling-window-formal-description.7">
    <ov:versionnumber>7</ov:versionnumber>
    <dcterms:title>Rolling Window - Formal Description</dcterms:title>
    <sioc:content>&lt;h2&gt;Formal Description of Rolling Window Analysis&lt;/h2&gt;
&lt;h3&gt;Rolling Averages&lt;/h3&gt;
&lt;p&gt;To calculate a rolling average, we first select a &lt;em&gt;window size&lt;/em&gt;, w, which is significantly smaller than T, the total number of units (e.g. characters, words, or lines) in the document. The first window in the document thus consists of units 1-w. We count the number of features of interest, n, within this first window and divide by the window size, giving us an average of the number of features per units (p = n/w). From this information, we produce a data pair comprised of the ordinal number of the window, k and the value of p (k, p&lt;sub&gt;k&lt;/sub&gt;). So for the first window, the resulting data-pair is (1, p&lt;sub&gt;1&lt;/sub&gt;). We then shift the window one unit towards the end of the text by one unit by incrementing both the initial and final units in the window by 1 (k+1, w+1), tabulate number of times the feature of interest appears in the shifted window, and calculate p&lt;sub&gt;2&lt;/sub&gt;=n&lt;sub&gt;2&lt;/sub&gt;/w, producing a new pair of data-points, (2, p&lt;sub&gt;2&lt;/sub&gt;). This process is repeated, moving the window through the text until the edge of the window meets the end of the text (i.e., where k+w=T), producing a set of k coordinates in the form (k, p&lt;sub&gt;k&lt;/sub&gt;).&lt;/p&gt;
&lt;p&gt;Formally, the value of p at any location k is equal to:&lt;/p&gt;
&lt;p&gt;&lt;span class="math display"&gt;\[{{p_{k} = \left\lbrack \left( \sum_{i = k}^{k + w}n_{} \right) \div \left( w \right) \right\rbrack,\ k + w \leq \ T}_{}}_{}\]&lt;/span&gt;&lt;/p&gt;
&lt;blockquote&gt;
where k is the ordinal number of the first unit in the window&lt;br&gt;
w is the size of the window in units&lt;br&gt;
n is the total number of features of interest in the window&lt;br&gt;
T is the total number of units in the text.
&lt;/blockquote&gt;
&lt;h3&gt;Rolling Ratios&lt;/h3&gt;
&lt;p&gt;To calculate a rolling ratio, we first select a &lt;em&gt;window size&lt;/em&gt;, w, which is significantly smaller than T, the total number of units (e.g. characters, words, or lines) in the document. The first window in the document thus consists of units 1-w. If we have two mutually exclusive features of interest, m and r, we can calculate their ratio, q, to each other by dividing the number of appearances of the first feature by the total number of both features in a given window: q = (m/m+r). From this information we produce a data pair comprised of k, the ordinal number of the window and the value of q. For the first window, therefore, the resulting data pair is (k, q&lt;sub&gt;1&lt;/sub&gt;). We then shift the window one unit towards the end of the text by incrementing both the initial and final units in the window by 1, tabulate the number of times the feature of interest appears in the shifted window, and calculate q&lt;sub&gt;2&lt;/sub&gt;=m&lt;sub&gt;2&lt;/sub&gt; /(r&lt;sub&gt;2+&lt;/sub&gt;m&lt;sub&gt;2&lt;/sub&gt;), producing a new pair of data-points, (2, q&lt;sub&gt;2&lt;/sub&gt;). This process is repeated, moving the window through the text until the leading edge of the window meets the end of the text (i.e., where k+w=T), producing a set of k coordinates in the form (k, q&lt;sub&gt;k&lt;/sub&gt;).&lt;/p&gt;
&lt;p&gt;Formally, the value of q at any location k is equal to:&lt;/p&gt;
&lt;p&gt;&lt;span class="math display"&gt;\[{{q_{k} = \sum_{i = k}^{k + w}m_{} \div \left( \sum_{i = k}^{k + w}{m_{} + \sum_{i = k}^{k + w}r_{}} \right),\ k + w \leq \ T}_{}}_{}\]&lt;/span&gt;&lt;/p&gt;
&lt;blockquote&gt;
where k is the ordinal number of the first unit in the window&lt;br&gt;
w is the size of the window in units&lt;br&gt;
m is the total number of the first feature of interest in the window&lt;br&gt;
r is the total number of the second feature of interest in the window&lt;br&gt;
T is the total number of units in the text.
&lt;/blockquote&gt;
&lt;p&gt;A rolling ratio avoids the problem of a zero in the denominator of a fraction so long as the window is large enough that no segments completely lack either unit&lt;em&gt;.&lt;/em&gt;&lt;/p&gt;
&lt;h3&gt;Window Orientation&lt;/h3&gt;
&lt;p&gt;Using a &lt;em&gt;forward-looking window&lt;/em&gt;, it is not possible to calculate the average or ratio for the final w-1 units of the text, since this span of units is less than w, the size of the window. This span is lost to the analysis. Using a &lt;em&gt;backward-looking window&lt;/em&gt; that begins at the end of the text and moves towards the beginning will captures this data, but at the cost of losing the first w-1 units. A &lt;em&gt;centered window&lt;/em&gt; would start the window at the (w/2)&lt;sup&gt;th&lt;/sup&gt; point in the document and end at the (T-(w/2))&lt;sup&gt;th&lt;/sup&gt; point. This method leaves unanalyzed segments at both the beginning and the end of the document that are each only half the size (w/2 units long) of those generated by the forward- or backward-looking windows.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Currently, Lexos is only able to produce forward-looking window analyses.&lt;/strong&gt;
&lt;script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"&gt;&lt;/script&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-07-12T13:27:01+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:415896"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-formal-description"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/rolling-window-analysis-video">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <art:thumbnail rdf:resource="https://i.ytimg.com/vi/PYUtMbxXcUg/hqdefault.jpg"/>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-07-01T13:32:39+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:163609"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-analysis-video.4"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-analysis-video.4"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/rolling-window-analysis-video.4">
    <ov:versionnumber>4</ov:versionnumber>
    <dcterms:title>Rolling Window Analysis</dcterms:title>
    <dcterms:description>Rolling Window Analysis.https://www.youtube.com/watch?v=MX6AUX1b1w0</dcterms:description>
    <art:url rdf:resource="https://www.youtube.com/watch?v=PYUtMbxXcUg"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-07-01T14:36:47+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:407813"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/rolling-window-analysis-video"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
    <dcterms:source>YouTube</dcterms:source>
    <art:sourceLocation rdf:resource="https://www.youtube.com/watch?v=PYUtMbxXcUg"/>
    <dcterms:date>2015-07-01T16:48:49.000Z</dcterms:date>
    <dcterms:identifier>PYUtMbxXcUg</dcterms:identifier>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:839396:839397:1">
    <scalar:urn rdf:resource="urn:scalar:path:839396:839397:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/scrubbing.7"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/special-characters-and-non-roman-writing-systems.2#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/special-characters-and-non-roman-writing-systems">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-28T18:37:02+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:176682"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/special-characters-and-non-roman-writing-systems.2"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/special-characters-and-non-roman-writing-systems.2"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/special-characters-and-non-roman-writing-systems.2">
    <ov:versionnumber>2</ov:versionnumber>
    <dcterms:title>Special Characters and Non-Roman Writing Systems</dcterms:title>
    <dcterms:description>Pre-processing considerations</dcterms:description>
    <sioc:content>Discussion of pre-processing considerations, character encoding, etc. for pre-processing.&lt;br /&gt;&lt;br /&gt;This path has not yet been developed.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-22T17:42:38+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:839397"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/special-characters-and-non-roman-writing-systems"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/silhouette-scores">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-19T17:37:50+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:318938"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/silhouette-scores.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/silhouette-scores.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/silhouette-scores.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Silhouette Scores</dcterms:title>
    <dcterms:description>Description of silhouette scores and their use</dcterms:description>
    <sioc:content>&lt;p&gt;A &lt;strong&gt;&lt;a target="_blank" href="https://en.wikipedia.org/wiki/Silhouette_(clustering)"&gt;Silhouette Score&lt;/a&gt;&lt;/strong&gt;, or &lt;strong&gt;Silhouette Coefficient&lt;/strong&gt;, is a measure of fit for your clusters. It gives a general indication of how well individual objects lie within their cluster. A score of 1 indicates tight, distinct clusters. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.&lt;/p&gt;&lt;p&gt;Silhouette scores are calculated from&lt;/p&gt;&lt;ul&gt;&lt;li&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; The mean distance between an object and all other objects in the same cluster.&lt;/li&gt;&lt;li&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; The mean distance between an object and all other objects in the next nearest cluster.&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;Applied to documents, a single silhouette score is calculated for each document. The silhouette score for a collection of documents is the mean of the silhouette scores for each individual document.&lt;/p&gt;&lt;p&gt;Silhouette scores are most easily calculated from flat clusterings like those produced by &lt;a href="k-means-clustering"&gt;k-means&lt;/a&gt;. In a flat cluster, where every object is represented in two-dimensional space, simple Euclidean distance (or any distance metric that can be substituted) can be used to measure difference between every point on the plane. The Lexos K-Means Tool offers all the distance metrics available in its Hierarchical Clustering Tool for calculating silhouette scores.&lt;/p&gt;&lt;p&gt;Generating a silhouette score for hierarchical clusterings is slightly more complicated. Since in a hierarchical cluster analysis clusters can also form parts of other clusters, the hierarchy must first be &amp;quot;flattened&amp;quot; so that documents can be assigned to specific clusters on a flat plane. There are many ways of doing this. Perhaps the easiest to understand is the use of a maximum number of clusters as a method for assigning documents. In hierarchical clustering, each document is technically a cluster of 1, so the maximum number of clusters possible is the same as the number of leaves in the dendrogram. Since a dendrogram needs to have at least one branching structure, the minimum number of possible clusters is 2. In other words, we can choose to provide a &lt;strong&gt;threshold&lt;/strong&gt; of 2 to the maximum number of leaves in our dendrogram as the maximum number of clusters we will allow in our flattened hierarchy.&lt;/p&gt;&lt;p&gt;One way to think about this is to imagine a line bisecting a vertically-oriented dendrogram at a random point on the x-axis. How confident can we be that all the documents on the left belong together (and the same for the documents on the right)? If we draw lines between every document, we are naturally very confident that each document belongs with itself. If we draw a smaller number of lines, we will lose confidence, but we might still find that the groupings acceptable. Manipulating the maximum cluster threshold, then, is one way in which we could examine how robust our groupings are.&lt;/p&gt;&lt;p&gt;Another method we could adopt to flatten a hierarchical cluster is to set the threshold according to some branch height in the dendrogram. We can then compare the height of each link in the tree with the average height of the adjacent links that are less than, say, two levels below it. This comparison is known as the &lt;strong&gt;inconsistency coefficient&lt;/strong&gt;, the theory being that a clade with branch height that is very different from the heights of the adjacent clades is &amp;quot;inconsistent&amp;quot; with them.&lt;/p&gt;&lt;p&gt;At present, Lexos provides only the maximum cluster criterion as a method of flattening the dendrogram, although the inconsistency criterion and others will be added in the future.&lt;/p&gt;&lt;p&gt;If you are wondering why silhouette scores have that name, imagine the silhouette score for each document in a cluster plotted as a horizontal line (the length of the line being proportional to the score). With the lines for each document piled on top of each other, they will form the appearance of a solid shape&amp;mdash;a silhouette&amp;mdash;which will be different for each cluster. Peter Rousseeuw&amp;#39;s original description of silhouettes is quite accessible, even for the non-mathematician reader.&lt;/p&gt;&lt;p&gt;See Peter J. Rousseeuw (1987). &amp;quot;Silhouettes: a Graphical Aid to the Interpretation and Validation of Cluster Analysis&amp;quot;. &lt;em&gt;Computational and Applied Mathematics&lt;/em&gt;. 20: 53&amp;ndash;65.&lt;/p&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-19T17:37:50+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:838290"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/silhouette-scores"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/the-role-of-pre-processing">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-28T18:32:09+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:176678"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/the-role-of-pre-processing.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/the-role-of-pre-processing.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/the-role-of-pre-processing.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>The Role of Pre-Processing</dcterms:title>
    <dcterms:description>General considerations for pre-processing texts.</dcterms:description>
    <sioc:content>This page will outline the nature of pre-processing tasks and discuss their importance, the amount time normally given to them, and their importance to the results of quantitative analysis.</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-28T18:32:10+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:438863"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/the-role-of-pre-processing"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/topics">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-08-16T00:05:46+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:173669"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/topics.6"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/topics.6"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/topics.6">
    <ov:versionnumber>6</ov:versionnumber>
    <dcterms:title>Topics</dcterms:title>
    <dcterms:description>Explore this path to learn about the Lexomic methods</dcterms:description>
    <sioc:content>Explore this path to learn about the Lexomic methods</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <scalar:continue_to_content_id>160761</scalar:continue_to_content_id>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2017-02-27T13:20:53+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:1044730"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/topics"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1044730:839395:1">
    <scalar:urn rdf:resource="urn:scalar:path:1044730:839395:1"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/topics.6"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/pre-processing.5#index=1"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1044730:840503:2">
    <scalar:urn rdf:resource="urn:scalar:path:1044730:840503:2"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/topics.6"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/visualization.1#index=2"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/visualization">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-25T14:08:10+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:319597"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/visualization.1"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/visualization.1"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/visualization.1">
    <ov:versionnumber>1</ov:versionnumber>
    <dcterms:title>Visualization</dcterms:title>
    <dcterms:description>Start of visualization topics path</dcterms:description>
    <sioc:content>This currently is an undeveloped path. Until we have developed fully, we will list relevant internal and external resources as unordered bullet points.&lt;ul&gt;&lt;li&gt;&lt;span id="eow-title" class="watch-title" dir="ltr" title="Miriam Posner: Data Trouble: Why Humanists Have Problems with Datavis, and Why Anyone Should Care"&gt;&lt;a target="_blank" href="https://www.youtube.com/watch?v=sW0u1pNQNxc"&gt;Miriam Posner: Data Trouble: Why Humanists Have Problems with Datavis, and Why Anyone Should Care&lt;/a&gt; &lt;/span&gt;(YouTube)&amp;nbsp;&lt;/li&gt;&lt;/ul&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2016-08-25T14:08:10+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:840503"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/visualization"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1044730:838227:3">
    <scalar:urn rdf:resource="urn:scalar:path:1044730:838227:3"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/topics.6"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/cluster-analysis.35#index=3"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="urn:scalar:path:1044730:1044728:4">
    <scalar:urn rdf:resource="urn:scalar:path:1044730:1044728:4"/>
    <oac:hasBody rdf:resource="http://scalar.usc.edu/works/lexos/topics.6"/>
    <oac:hasTarget rdf:resource="http://scalar.usc.edu/works/lexos/epistemology.1#index=4"/>
    <rdf:type rdf:resource="http://www.openannotation.org/ns/Annotation"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/using-lexos-over-the-web-vs-local-installation">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Composite"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/6902"/>
    <dcterms:created>2015-06-10T08:08:15+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:160608"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/using-lexos-over-the-web-vs-local-installation.4"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/using-lexos-over-the-web-vs-local-installation.4"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/using-lexos-over-the-web-vs-local-installation.4">
    <ov:versionnumber>4</ov:versionnumber>
    <dcterms:title>Using Lexos: Over the Web vs. Local Installation</dcterms:title>
    <sioc:content>The quickest and simplest way to use the Lexos suite of tools is through the Lexos website, located at &lt;a href="http://lexos.wheatoncollege.edu"&gt;lexos.wheatoncollege.edu&lt;/a&gt;.&amp;nbsp;&lt;div&gt;
&lt;/div&gt;&lt;div&gt;However, for those with a significant number of large files, downloading and installing the Lexos suite locally may be a better option. Installation instructions can be found by clicking the appropriate link:&lt;/div&gt;&lt;div&gt;
&lt;/div&gt;&lt;div&gt;&lt;a href="media/1_macosxinstallguide" data-display-content-preview-box="true"&gt;MacOS&lt;/a&gt;
&lt;/div&gt;&lt;div&gt;&lt;a href="media/windows-install-directions" data-display-content-preview-box="true"&gt;Windows&lt;/a&gt;
&lt;/div&gt;&lt;div&gt;&lt;a href="media/linux-install-directions" data-display-content-preview-box="true"&gt;Linux&lt;/a&gt;&amp;nbsp;&lt;/div&gt;</sioc:content>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3689"/>
    <dcterms:created>2015-08-11T19:25:50+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:431001"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/using-lexos-over-the-web-vs-local-installation"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/zotero-bibliography">
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Media"/>
    <scalar:isLive>1</scalar:isLive>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-06-24T12:58:01+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:content:162318"/>
    <scalar:version rdf:resource="http://scalar.usc.edu/works/lexos/zotero-bibliography.3"/>
    <dcterms:hasVersion rdf:resource="http://scalar.usc.edu/works/lexos/zotero-bibliography.3"/>
    <scalar:citation>method=instancesof/content;methodNumNodes=63;</scalar:citation>
  </rdf:Description>

  <rdf:Description rdf:about="http://scalar.usc.edu/works/lexos/zotero-bibliography.3">
    <ov:versionnumber>3</ov:versionnumber>
    <dcterms:title>Zotero Bibliography</dcterms:title>
    <art:url rdf:resource="http://bibbase.org/show?bib=https%3A%2F%2Fapi.zotero.org%2Fgroups%2F47671%2Fitems%3Fkey%3DrjMBGMVIUHoqwuGZRNI4HppO%26format%3Dbibtex%26limit%3D100"/>
    <scalar:defaultView>plain</scalar:defaultView>
    <prov:wasAttributedTo rdf:resource="http://scalar.usc.edu/works/lexos/users/3693"/>
    <dcterms:created>2015-06-24T18:33:42+00:00</dcterms:created>
    <scalar:urn rdf:resource="urn:scalar:version:404876"/>
    <dcterms:isVersionOf rdf:resource="http://scalar.usc.edu/works/lexos/zotero-bibliography"/>
    <rdf:type rdf:resource="http://scalar.usc.edu/2012/01/scalar-ns#Version"/>
  </rdf:Description>

</rdf:RDF>