2 * Copyright (C) 2014 Open Whisper Systems
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 package org
.asamk
.signal
.logging
;
20 import java
.util
.Arrays
;
21 import java
.util
.HashSet
;
22 import java
.util
.Locale
;
24 import java
.util
.regex
.Matcher
;
25 import java
.util
.regex
.Pattern
;
28 * Scrub data for possibly sensitive information.
30 public final class Scrubber
{
36 * The middle group will be censored.
37 * Supposedly, the shortest international phone numbers in use contain seven digits.
38 * Handles URL encoded +, %2B
40 private static final Pattern E164_PATTERN
= Pattern
.compile("(\\+|%2B|_)(\\d{5,13})(\\d{2})");
41 private static final String E164_CENSOR
= "*************";
43 private static final Pattern GROUP_V1_ID_PATTERN
= Pattern
.compile(
44 "(/org/asamk/Signal/.*Groups/[a-zA-Z0-9/_+-]{2}|[a-zA-Z0-9/_+-]{2})([a-zA-Z0-9/_+-]{18})([a-zA-Z0-9/_+-]{2})(==|__)");
45 private static final String GROUP_V1_ID_CENSOR
= "*".repeat(18);
47 private static final Pattern GROUP_V2_ID_PATTERN
= Pattern
.compile(
48 "(/org/asamk/Signal/.*Groups/[a-zA-Z0-9/_+-]{2}|[a-zA-Z0-9/_+-]{2})([a-zA-Z0-9/_+-]{39})([a-zA-Z0-9/_+-]{2})([=_])");
49 private static final String GROUP_V2_ID_CENSOR
= "*".repeat(39);
52 * The second group will be censored.
54 private static final Pattern CRUDE_EMAIL_PATTERN
= Pattern
.compile("\\b([^\\s/])([^\\s/]*@[^\\s]+)");
55 private static final String EMAIL_CENSOR
= "...@...";
58 * The middle group will be censored.
60 private static final Pattern UUID_PATTERN
= Pattern
.compile(
61 "(JOB::)?([0-9a-f]{8}[-_][0-9a-f]{4}[-_][0-9a-f]{4}[-_][0-9a-f]{4}[-_][0-9a-f]{10})([0-9a-f]{2})",
62 Pattern
.CASE_INSENSITIVE
);
63 private static final String UUID_CENSOR
= "********-****-****-****-**********";
66 * The entire string is censored.
68 private static final Pattern IPV4_PATTERN
= Pattern
.compile("\\b"
69 + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\."
70 + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\."
71 + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\."
72 + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
74 private static final String IPV4_CENSOR
= "...ipv4...";
77 * The domain name except for TLD will be censored.
79 private static final Pattern DOMAIN_PATTERN
= Pattern
.compile("([a-z0-9]+\\.)+([a-z0-9\\-]*[a-z\\-][a-z0-9\\-]*)",
80 Pattern
.CASE_INSENSITIVE
);
81 private static final String DOMAIN_CENSOR
= "***.";
82 private static final Set
<String
> TOP_100_TLDS
= new HashSet
<>(Arrays
.asList("com",
181 public static CharSequence
scrub(CharSequence
in) {
186 in = scrubGroupV2Ids(in);
187 in = scrubGroupV1Ids(in);
188 in = scrubDomains(in);
194 private static CharSequence
scrubE164(CharSequence
in) {
197 (matcher
, output
) -> output
.append(matcher
.group(1))
198 .append(E164_CENSOR
, 0, matcher
.group(2).length())
199 .append(matcher
.group(3)));
202 private static CharSequence
scrubGroupV1Ids(CharSequence
in) {
205 (matcher
, output
) -> output
.append(matcher
.group(1))
206 .append(GROUP_V1_ID_CENSOR
, 0, matcher
.group(2).length())
207 .append(matcher
.group(3)));
210 private static CharSequence
scrubGroupV2Ids(CharSequence
in) {
213 (matcher
, output
) -> output
.append(matcher
.group(1))
214 .append(GROUP_V2_ID_CENSOR
, 0, matcher
.group(2).length())
215 .append(matcher
.group(3)));
218 private static CharSequence
scrubEmail(CharSequence
in) {
221 (matcher
, output
) -> output
.append(matcher
.group(1)).append(EMAIL_CENSOR
));
224 private static CharSequence
scrubUuids(CharSequence
in) {
225 return scrub(in, UUID_PATTERN
, (matcher
, output
) -> {
226 if (matcher
.group(1) != null && !matcher
.group(1).isEmpty()) {
227 output
.append(matcher
.group(1)).append(matcher
.group(2)).append(matcher
.group(3));
229 output
.append(UUID_CENSOR
).append(matcher
.group(3));
234 private static CharSequence
scrubDomains(CharSequence
in) {
235 return scrub(in, DOMAIN_PATTERN
, (matcher
, output
) -> {
236 String match
= matcher
.group(0);
237 if (matcher
.groupCount() == 2
238 && TOP_100_TLDS
.contains(matcher
.group(2).toLowerCase(Locale
.US
))
239 && !match
.endsWith("whispersystems.org")
240 && !match
.endsWith("signal.org")) {
241 output
.append(DOMAIN_CENSOR
).append(matcher
.group(2));
243 output
.append(match
);
248 private static CharSequence
scrubIpv4(CharSequence
in) {
249 return scrub(in, IPV4_PATTERN
, (matcher
, output
) -> output
.append(IPV4_CENSOR
));
252 private static CharSequence
scrub(CharSequence
in, Pattern pattern
, ProcessMatch processMatch
) {
253 final StringBuilder output
= new StringBuilder(in.length());
254 final Matcher matcher
= pattern
.matcher(in);
256 int lastEndingPos
= 0;
258 while (matcher
.find()) {
259 output
.append(in, lastEndingPos
, matcher
.start());
261 processMatch
.scrubMatch(matcher
, output
);
263 lastEndingPos
= matcher
.end();
266 if (lastEndingPos
== 0) {
267 // there were no matches, save copying all the data
270 output
.append(in, lastEndingPos
, in.length());
276 private interface ProcessMatch
{
278 void scrubMatch(Matcher matcher
, StringBuilder output
);