001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2015 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.regex.Matcher;
023import java.util.regex.Pattern;
024
025import com.puppycrawl.tools.checkstyle.api.Check;
026import com.puppycrawl.tools.checkstyle.api.DetailAST;
027import com.puppycrawl.tools.checkstyle.api.TokenTypes;
028
029/**
030 * <p>
031 * Restrict using <a href =
032 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
033 * Unicode escapes</a> (e.g. \u221e).
034 * It is possible to allow using escapes for
035 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
036 * non-printable(control) characters</a>.
 * Also, this check can be configured to allow using escapes
 * if a trailing comment is present. There is an option to
 * allow escapes if the literal consists only of escapes, and an
 * option to allow escapes for non-printable (whitespace) characters.
041 * </p>
042 * <p>
043 * Examples of using Unicode:</p>
044 * <pre>
045 * String unitAbbrev = "μs"; //Best: perfectly clear even without a comment.
046 * String unitAbbrev = "\u03bcs"; //Poor: the reader has no idea what this is.
047 * </pre>
048 * <p>
049 * An example of how to configure the check is:
050 * </p>
051 * <pre>
052 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
053 * </pre>
054 * <p>
055 * An example of non-printable(control) characters.
056 * </p>
057 * <pre>
058 * return '\ufeff' + content; // byte order mark
059 * </pre>
060 * <p>
061 * An example of how to configure the check to allow using escapes
062 * for non-printable(control) characters:
063 * </p>
064 * <pre>
065 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
066 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
067 * &lt;/module&gt;
068 * </pre>
069 * <p>
 * Example of using escapes with a trailing comment:
071 * </p>
072 * <pre>
073 * String unitAbbrev = "\u03bcs"; // Greek letter mu, "s"
074 * </pre>
075 * <p>An example of how to configure the check to allow using escapes
 * if a trailing comment is present:
077 * </p>
078 * <pre>
079 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
080 *     &lt;property name="allowByTailComment" value="true"/&gt;
081 * &lt;/module&gt;
082 * </pre>
083 * <p>Example of using escapes if literal contains only them:
084 * </p>
085 * <pre>
086 * String unitAbbrev = "\u03bc\u03bc\u03bc";
087 * </pre>
088 * <p>An example of how to configure the check to allow escapes
089 * if literal contains only them:
090 * </p>
091 * <pre>
092 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
093 *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
094 * &lt;/module&gt;
095 * </pre>
096 * <p>An example of how to configure the check to allow non-printable escapes:
097 * </p>
098 * <pre>
099 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
100 *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
101 * &lt;/module&gt;
102 * </pre>
103 *
104 * @author maxvetrenko
105 *
106 */
107public class AvoidEscapedUnicodeCharactersCheck
108    extends Check {
109    /** Regular expression for Unicode chars. */
110    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
111
112    /** Regular expression Unicode control characters. */
113    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)"
114            + "(00[0-1][0-1A-Fa-f]|00[8-9][0-9A-Fa-f]|034(f|F)|070(f|F)"
115            + "|180(e|E)|200[b-fB-F]|202[b-eB-E]|206[0-4a-fA-F]"
116            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");
117
118    /** Regular expression for trail comment. */
119    private static final Pattern COMMENT_REGEXP = Pattern.compile(";[ ]*//+"
120            + "[a-zA-Z0-9 ]*|;[ ]*/[*]+[a-zA-Z0-9 ]*");
121
122    /** Regular expression for all escaped chars. */
123    private static final Pattern ALL_ESCAPED_CHARS =
124            Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
125                    + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");
126
127    /** Regular expression for non-printable unicode chars. */
128    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
129            + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
130            + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
131            + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
132            + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
133            + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
134            + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
135            + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
136            + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
137            + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
138            + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
139            + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
140            + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
141            + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
142            + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
143            + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
144            + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
145            + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
146            + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
147            + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");
148
149    /** Allow use escapes for non-printable(control) characters.  */
150    private boolean allowEscapesForControlCharacters;
151
152    /** Allow use escapes if trail comment is present. */
153    private boolean allowByTailComment;
154
155    /** Allow if all characters in literal are escaped. */
156    private boolean allowIfAllCharactersEscaped;
157
158    /** Allow escapes for space literals. */
159    private boolean allowNonPrintableEscapes;
160
161    /**
162     * Set allowIfAllCharactersEscaped.
163     * @param allow user's value.
164     */
165    public final void setAllowEscapesForControlCharacters(boolean allow) {
166        allowEscapesForControlCharacters = allow;
167    }
168
169    /**
170     * Set allowByTailComment.
171     * @param allow user's value.
172     */
173    public final void setAllowByTailComment(boolean allow) {
174        allowByTailComment = allow;
175    }
176
177    /**
178     * Set allowIfAllCharactersEscaped.
179     * @param allow user's value.
180     */
181    public final void setAllowIfAllCharactersEscaped(boolean allow) {
182        allowIfAllCharactersEscaped = allow;
183    }
184
185    /**
186     * Set allowSpaceEscapes.
187     * @param allow user's value.
188     */
189    public final void setAllowNonPrintableEscapes(boolean allow) {
190        allowNonPrintableEscapes = allow;
191    }
192
193    @Override
194    public int[] getDefaultTokens() {
195        return getAcceptableTokens();
196    }
197
198    @Override
199    public int[] getAcceptableTokens() {
200        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
201    }
202
203    @Override
204    public int[] getRequiredTokens() {
205        return getAcceptableTokens();
206    }
207
208    @Override
209    public void visitToken(DetailAST ast) {
210
211        final String literal = ast.getText();
212
213        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
214                || isAllCharactersEscaped(literal)
215                || allowEscapesForControlCharacters
216                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
217                || allowNonPrintableEscapes
218                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
219            log(ast.getLineNo(), "forbid.escaped.unicode.char");
220        }
221    }
222
223    /**
224     * Checks if literal has Unicode chars.
225     * @param literal String literal.
226     * @return true if literal has Unicode chars.
227     */
228    private static boolean hasUnicodeChar(String literal) {
229        return UNICODE_REGEXP.matcher(literal).find();
230    }
231
232    /**
233     * Check if String literal contains Unicode control chars.
234     * @param literal String literal.
235     * @param pattern RegExp for valid characters.
236     * @return true, if String literal contains Unicode control chars.
237     */
238    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
239        final int unicodeMatchesCounter =
240                countMatches(UNICODE_REGEXP, literal);
241        final int unicodeValidMatchesCounter =
242                countMatches(pattern, literal);
243        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
244    }
245
246    /**
247     * Check if trail comment is present after ast token.
248     * @param ast current token.
249     * @return true if trail comment is present after ast token.
250     */
251    private boolean hasTrailComment(DetailAST ast) {
252        final DetailAST variableDef = getVariableDef(ast);
253        DetailAST semi;
254
255        if (variableDef == null) {
256            semi = getSemi(ast);
257        }
258        else {
259            semi = variableDef.getNextSibling();
260
261            if (semi.getType() != TokenTypes.SEMI) {
262                semi = variableDef.getLastChild();
263            }
264        }
265
266        boolean result = false;
267        if (semi != null) {
268            final int lineNo = semi.getLineNo();
269            final String currentLine = getLine(lineNo - 1);
270
271            if (COMMENT_REGEXP.matcher(currentLine).find()) {
272                result = true;
273            }
274        }
275
276        return result;
277    }
278
279    /**
280     * Count regexp matches into String literal.
281     * @param pattern pattern.
282     * @param target String literal.
283     * @return count of regexp matches.
284     */
285    private static int countMatches(Pattern pattern, String target) {
286        int matcherCounter = 0;
287        final Matcher matcher = pattern.matcher(target);
288        while (matcher.find()) {
289            matcherCounter++;
290        }
291        return matcherCounter;
292    }
293
294    /**
295     * Get variable definition.
296     * @param ast current token.
297     * @return variable definition.
298     */
299    private static DetailAST getVariableDef(DetailAST ast) {
300        DetailAST result = ast.getParent();
301        while (result != null
302                && result.getType() != TokenTypes.VARIABLE_DEF) {
303            result = result.getParent();
304        }
305        return result;
306    }
307
308    /**
309     * Get semi token.
310     * @param ast current token.
311     * @return semi token or null.
312     */
313    private static DetailAST getSemi(DetailAST ast) {
314        DetailAST result = ast.getParent();
315        while (result != null
316                && result.getLastChild().getType() != TokenTypes.SEMI) {
317            result = result.getParent();
318        }
319        if (result != null) {
320            result = result.getLastChild();
321        }
322        return result;
323    }
324
325    /**
326     * Checks if all characters in String literal is escaped.
327     * @param literal current literal.
328     * @return true if all characters in String literal is escaped.
329     */
330    private boolean isAllCharactersEscaped(String literal) {
331        return allowIfAllCharactersEscaped
332                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
333                        literal.length() - 1)).find();
334    }
335}