From 8d269d042ac0577a8acfbf00d4e57bae5a344124 Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Fri, 28 Mar 2014 12:41:12 -0400 Subject: [PATCH 1/2] prevent errors when parent not found --- lib/word-to-markdown.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/word-to-markdown.rb b/lib/word-to-markdown.rb index aedb876..bdaaa19 100644 --- a/lib/word-to-markdown.rb +++ b/lib/word-to-markdown.rb @@ -15,6 +15,7 @@ class WordToMarkdown .MsoListParagraphCxSpMiddle .MsoListParagraphCxSpLast .MsoListParagraph + ul ] attr_reader :path, :doc @@ -178,7 +179,7 @@ def semanticize! # Convert list paragraphs to actual numbered and unnumbered lists node.node_name = "li" - node.parent = list + node.parent = list if list # Scrub unicode bullets span = node.css("span:first")[1] From ae4b60da2055706486e98a172272c8cd364e7649 Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Sun, 30 Mar 2014 14:14:03 -0400 Subject: [PATCH 2/2] initial gdoc support --- lib/word-to-markdown.rb | 33 +++-- test/fixtures/gdoc.htm | 215 ++++++++++++++++++++++++++++ test/test_word_to_markdown_lists.rb | 4 + 3 files changed, 242 insertions(+), 10 deletions(-) create mode 100644 test/fixtures/gdoc.htm diff --git a/lib/word-to-markdown.rb b/lib/word-to-markdown.rb index bdaaa19..2d53f39 100644 --- a/lib/word-to-markdown.rb +++ b/lib/word-to-markdown.rb @@ -15,7 +15,7 @@ class WordToMarkdown .MsoListParagraphCxSpMiddle .MsoListParagraphCxSpLast .MsoListParagraph - ul + li ] attr_reader :path, :doc @@ -147,6 +147,20 @@ def li_selectors LI_SELECTORS.join(",") end + # Returns an array of all indented values + def indents + @indents ||= doc.css(li_selectors).map{ |el| el.indent }.uniq.sort + end + + # Determine the indent level given an indent value + # + # level - the true indent, e.g., 2.5 (from 2.5em) + # + # Returns an integer representing the indent level + def indent(level) + indents.find_index level + end + # Try to make semantic markup explicit where implied by the export def semanticize! @@ -161,21 +175,20 @@ def semanticize! list_type = "ul" end + # calculate indent level + current_indent = indent(node.indent) + # Determine parent node for this li, creating it if necessary - if node.indent > indent_level + if current_indent > indent_level || indent_level == 0 && node.parent.css(".indent#{current_indent}").empty? list = Nokogiri::XML::Node.new list_type, @doc - list.classes = ["list", "indent#{node.indent}"] - if node.indent == 1 - list.parent = node.parent - else - list.parent = node.parent.css(".indent#{node.indent-1} li").last - end + list.classes = ["list", "indent#{current_indent}"] + list.parent = node.parent.css(".indent#{current_indent-1} li").last || node.parent else - list = node.parent.css(".indent#{node.indent}").last + list = node.parent.css(".indent#{current_indent}").last end # Note our current nesting depth - indent_level = node.indent + indent_level = current_indent # Convert list paragraphs to actual numbered and unnumbered lists node.node_name = "li" diff --git a/test/fixtures/gdoc.htm b/test/fixtures/gdoc.htm new file mode 100644 index 0000000..47a5ff8 --- /dev/null +++ b/test/fixtures/gdoc.htm @@ -0,0 +1,215 @@ + + + + + + + + + + + + + diff --git a/test/test_word_to_markdown_lists.rb b/test/test_word_to_markdown_lists.rb index 1aeb2fd..e35d616 100644 --- a/test/test_word_to_markdown_lists.rb +++ b/test/test_word_to_markdown_lists.rb @@ -22,6 +22,10 @@ class TestWordToMarkdownLists < Test::Unit::TestCase validate_fixture "nested-ul", "- One\n - Sub one\n - Sub sub one\n - Sub sub two\n\n - Sub two\n\n- Two" end + should "parse gdoc nested uls" do + validate_fixture "gdoc", "- Bullet point\n\n - Indented bullet point" + end + should "parse left margin" do doc = WordToMarkdown.new "

foo

" assert_equal 25, doc.doc.css("p").first.left_margin