Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Genesys Backend
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
12
Issues
12
List
Boards
Labels
Service Desk
Milestones
Operations
Operations
Incidents
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Genesys PGR
Genesys Backend
Commits
332e70f1
Commit
332e70f1
authored
Jul 29, 2015
by
Matija Obreza
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
HTML to plaintext converter
parent
eb622064
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
191 additions
and
0 deletions
+191
-0
pom.xml
pom.xml
+5
-0
src/main/java/org/genesys2/server/service/HtmlConverter.java
src/main/java/org/genesys2/server/service/HtmlConverter.java
+25
-0
src/main/java/org/genesys2/server/service/impl/JsoupHtmlConverter.java
.../org/genesys2/server/service/impl/JsoupHtmlConverter.java
+145
-0
src/main/java/org/genesys2/server/servlet/controller/JspHelper.java
...ava/org/genesys2/server/servlet/controller/JspHelper.java
+16
-0
No files found.
pom.xml
View file @
332e70f1
...
...
@@ -491,6 +491,11 @@
<version>
0.8.1
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.jsoup
</groupId>
<artifactId>
jsoup
</artifactId>
<version>
1.8.2
</version>
</dependency>
</dependencies>
<build>
...
...
src/main/java/org/genesys2/server/service/HtmlConverter.java
0 → 100644
View file @
332e70f1
/**
* Copyright 2014 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package
org.genesys2.server.service
;
/**
* Convert HTML to plain text
*/
public
interface
HtmlConverter
{
/**
 * Convert the given HTML markup to plain text.
 *
 * @param html the HTML source to convert
 * @return the plain-text form of {@code html}
 */
String
toText
(
String
html
);
}
src/main/java/org/genesys2/server/service/impl/JsoupHtmlConverter.java
0 → 100644
View file @
332e70f1
/**
* Copyright 2014 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package
org.genesys2.server.service.impl
;
import
org.genesys2.server.service.HtmlConverter
;
import
org.jsoup.Jsoup
;
import
org.jsoup.helper.StringUtil
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Node
;
import
org.jsoup.nodes.TextNode
;
import
org.jsoup.select.NodeTraversor
;
import
org.jsoup.select.NodeVisitor
;
import
org.springframework.beans.factory.annotation.Value
;
import
org.springframework.cache.annotation.Cacheable
;
import
org.springframework.stereotype.Service
;
/**
 * {@link HtmlConverter} implementation backed by jsoup. The HTML is parsed as a
 * body fragment and walked node by node, emitting text content, simple
 * line-break/list formatting and the absolute target of every link.
 *
 * Based on
 * https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java
 */
@Service
public class JsoupHtmlConverter implements HtmlConverter {

    /** Base URL against which relative link targets are resolved. */
    @Value("${base.url}")
    private String baseUrl;

    /**
     * Convert an HTML fragment to plain text. Results are cached in the
     * {@code htmltotextcache} cache, keyed by the HTML source.
     *
     * @param html the HTML fragment to convert; may be {@code null}
     * @return the trimmed plain-text rendering, or {@code null} when
     *         {@code html} is {@code null}
     */
    @Override
    @Cacheable(value = "htmltotextcache", key = "#html", condition = "#html != null", unless = "#result == null")
    public String toText(String html) {
        // A null input has no usable cache key and cannot be parsed; the cache
        // condition above skips caching and this guard skips the parser.
        if (html == null) {
            return null;
        }
        Document doc = Jsoup.parseBodyFragment(html);
        return getPlainText(doc);
    }

    /**
     * Format an Element to plain-text
     *
     * @param element
     *            the root element to format
     * @return formatted text, with leading and trailing whitespace trimmed
     */
    private String getPlainText(Element element) {
        FormattingVisitor formatter = new FormattingVisitor(baseUrl);
        NodeTraversor traversor = new NodeTraversor(formatter);
        // walk the DOM, and call .head() and .tail() for each node
        traversor.traverse(element);
        return formatter.toString().trim();
    }

    /**
     * The formatting rules, applied while {@link NodeTraversor} walks the DOM
     * depth-first: {@link #head(Node, int)} runs when a node is entered and
     * {@link #tail(Node, int)} once all of its children have been visited.
     */
    private static final class FormattingVisitor implements NodeVisitor {

        /** Base URL used to resolve relative {@code href} values. */
        private final String baseUrl;

        /** Holds the accumulated text. */
        private final StringBuilder accum = new StringBuilder();

        FormattingVisitor(String baseUrl) {
            this.baseUrl = baseUrl;
        }

        /**
         * Hit when the node is first seen.
         */
        @Override
        public void head(Node node, int depth) {
            String name = node.nodeName();
            if (node instanceof TextNode) {
                // TextNodes carry all user-readable text in the DOM.
                append(((TextNode) node).text());
            } else if (name.equals("li")) {
                append("\n * ");
            } else if (name.equals("dt")) {
                append(" ");
            } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
                append("\n");
            }
        }

        /**
         * Hit when all of the node's children (if any) have been visited.
         */
        @Override
        public void tail(Node node, int depth) {
            String name = node.nodeName();
            if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5")) {
                append("\n");
            } else if (name.equals("a")) {
                node.setBaseUri(baseUrl);
                String target = node.absUrl("href");
                // absUrl yields an empty string when there is no href or it
                // cannot be resolved; do not emit a dangling " <>" in that case.
                if (!target.isEmpty()) {
                    append(String.format(" <%s>", target));
                }
            }
        }

        /**
         * Append text to the buffer, dropping a lone space when the buffer is
         * empty or already ends in a space or newline.
         */
        private void append(String text) {
            // don't accumulate long runs of empty spaces
            if (text.equals(" ") && endsWithBreakOrIsEmpty()) {
                return;
            }
            accum.append(text);
        }

        /**
         * @return {@code true} when nothing has been accumulated yet or the
         *         last accumulated character is a space or a newline
         */
        private boolean endsWithBreakOrIsEmpty() {
            if (accum.length() == 0) {
                return true;
            }
            char last = accum.charAt(accum.length() - 1);
            return last == ' ' || last == '\n';
        }

        @Override
        public String toString() {
            return accum.toString();
        }
    }
}
src/main/java/org/genesys2/server/servlet/controller/JspHelper.java
View file @
332e70f1
...
...
@@ -19,12 +19,14 @@ package org.genesys2.server.servlet.controller;
import
java.text.DateFormatSymbols
;
import
java.util.Locale
;
import
org.apache.commons.lang.StringUtils
;
import
org.genesys2.server.exception.UserException
;
import
org.genesys2.server.model.impl.Country
;
import
org.genesys2.server.model.impl.Crop
;
import
org.genesys2.server.model.impl.User
;
import
org.genesys2.server.service.CropService
;
import
org.genesys2.server.service.GeoService
;
import
org.genesys2.server.service.HtmlConverter
;
import
org.genesys2.server.service.UserService
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.stereotype.Component
;
...
...
@@ -42,6 +44,8 @@ public class JspHelper {
private
CropService
cropService
;
// JSON mapper used by toJson(Object) to serialize values
@Autowired
private
ObjectMapper
objectMapper
;
// HTML-to-plain-text converter used by the htmlToText helpers
@Autowired
private
HtmlConverter
htmlConverter
;
public
String
userFullName
(
Long
userId
)
{
if
(
userId
==
null
)
{
...
...
@@ -73,6 +77,18 @@ public class JspHelper {
/**
 * Serialize a value to JSON text using the injected object mapper's writer.
 *
 * @param object the value to serialize
 * @return the JSON string produced for {@code object}
 * @throws JsonProcessingException propagated from the mapper's writer
 */
public String toJson(Object object) throws JsonProcessingException {
    final String json = objectMapper.writer().writeValueAsString(object);
    return json;
}
/**
 * Convert HTML markup to plain text by delegating to the injected
 * {@link HtmlConverter}.
 *
 * @param html the HTML source
 * @return whatever the converter returns for {@code html}
 */
public String htmlToText(String html) {
    final String plainText = htmlConverter.toText(html);
    return plainText;
}
/**
 * Convert HTML markup to plain text and abbreviate the result.
 *
 * @param html      the HTML source
 * @param maxLength the maximum length handed to
 *                  {@code StringUtils.abbreviate}
 * @return the converted text after abbreviation
 */
public String htmlToText(String html, int maxLength) {
    // Convert first, then shorten the plain text rather than the markup.
    final String plainText = htmlConverter.toText(html);
    return StringUtils.abbreviate(plainText, maxLength);
}
/**
 * Abbreviate text by delegating to {@code StringUtils.abbreviate}.
 *
 * @param text      the text to shorten
 * @param maxLength the maximum length handed to
 *                  {@code StringUtils.abbreviate}
 * @return the abbreviated text
 */
public String abbreviate(String text, int maxLength) {
    final String shortened = StringUtils.abbreviate(text, maxLength);
    return shortened;
}
public
String
[]
monthNames
(
Locale
locale
)
{
DateFormatSymbols
dfs
=
new
DateFormatSymbols
(
locale
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment