/*
* Copyright (C) 2014 Open Whisper Systems
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.asamk.signal.logging;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Scrub data for possibly sensitive information.
 *
 * <p>The single entry point is {@link #scrub(CharSequence)}, which runs the input through a
 * fixed sequence of regular-expression based censors (UUIDs, phone numbers, e-mail addresses,
 * group ids, domain names and IPv4 addresses). Each censor replaces only the sensitive part of
 * a match and leaves the rest of the text untouched.
 *
 * <p>This class holds no mutable state; it is a static utility and cannot be instantiated.
 */
public final class Scrubber {

    private Scrubber() {
    }

    /**
     * The middle group will be censored.
     * Supposedly, the shortest international phone numbers in use contain seven digits.
     * Handles URL encoded +, %2B. A leading underscore is accepted as a prefix as well.
     */
    private static final Pattern E164_PATTERN = Pattern.compile("(\\+|%2B|_)(\\d{5,13})(\\d{2})");

    /** Source of censor characters for phone numbers; long enough for the 13 digit middle group. */
    private static final String E164_CENSOR = "*************";

    /**
     * Group 1 is an optional path prefix plus the first two id characters, group 2 (18 characters)
     * will be censored, group 3 is the last two id characters and group 4 is the terminator
     * ({@code ==} or {@code __}).
     */
    private static final Pattern GROUP_V1_ID_PATTERN = Pattern.compile(
            "(/org/asamk/Signal/.*Groups/[a-zA-Z0-9/_+-]{2}|[a-zA-Z0-9/_+-]{2})([a-zA-Z0-9/_+-]{18})([a-zA-Z0-9/_+-]{2})(==|__)");

    private static final String GROUP_V1_ID_CENSOR = "*".repeat(18);

    /**
     * Same layout as {@link #GROUP_V1_ID_PATTERN}, but the censored middle group is 39 characters
     * long and the terminator is a single {@code =} or {@code _}.
     */
    private static final Pattern GROUP_V2_ID_PATTERN = Pattern.compile(
            "(/org/asamk/Signal/.*Groups/[a-zA-Z0-9/_+-]{2}|[a-zA-Z0-9/_+-]{2})([a-zA-Z0-9/_+-]{39})([a-zA-Z0-9/_+-]{2})([=_])");

    private static final String GROUP_V2_ID_CENSOR = "*".repeat(39);

    /**
     * The second group will be censored.
     */
    private static final Pattern CRUDE_EMAIL_PATTERN = Pattern.compile("\\b([^\\s/])([^\\s/]*@[^\\s]+)");

    private static final String EMAIL_CENSOR = "...@...";

    /**
     * The middle group will be censored.
     * The first group is an optional {@code JOB::} prefix; when present the UUID is kept as is.
     */
    private static final Pattern UUID_PATTERN = Pattern.compile(
            "(JOB::)?([0-9a-f]{8}[-_][0-9a-f]{4}[-_][0-9a-f]{4}[-_][0-9a-f]{4}[-_][0-9a-f]{10})([0-9a-f]{2})",
            Pattern.CASE_INSENSITIVE);

    private static final String UUID_CENSOR = "********-****-****-****-**********";

    /**
     * The entire string is censored.
     */
    private static final Pattern IPV4_PATTERN = Pattern.compile("\\b"
            + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\."
            + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\."
            + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\."
            + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
            + "\\b");

    private static final String IPV4_CENSOR = "...ipv4...";

    /**
     * The domain name except for TLD will be censored.
     */
    private static final Pattern DOMAIN_PATTERN = Pattern.compile("([a-z0-9]+\\.)+([a-z0-9\\-]*[a-z\\-][a-z0-9\\-]*)",
            Pattern.CASE_INSENSITIVE);

    private static final String DOMAIN_CENSOR = "***.";

    /**
     * Lower-case TLDs for which a dotted name is treated as a domain and censored.
     * Dotted names with any other last label are left untouched.
     */
    private static final Set<String> TOP_100_TLDS = new HashSet<>(Arrays.asList("com",
            "net",
            "org",
            "jp",
            "de",
            "uk",
            "fr",
            "br",
            "it",
            "ru",
            "es",
            "me",
            "gov",
            "pl",
            "ca",
            "au",
            "cn",
            "co",
            "in",
            "nl",
            "edu",
            "info",
            "eu",
            "ch",
            "id",
            "at",
            "kr",
            "cz",
            "mx",
            "be",
            "tv",
            "se",
            "tr",
            "tw",
            "al",
            "ua",
            "ir",
            "vn",
            "cl",
            "sk",
            "ly",
            "cc",
            "to",
            "no",
            "fi",
            "us",
            "pt",
            "dk",
            "ar",
            "hu",
            "tk",
            "gr",
            "il",
            "news",
            "ro",
            "my",
            "biz",
            "ie",
            "za",
            "nz",
            "sg",
            "ee",
            "th",
            "io",
            "xyz",
            "pe",
            "bg",
            "hk",
            "lt",
            "link",
            "ph",
            "club",
            "si",
            "site",
            "mobi",
            "by",
            "cat",
            "wiki",
            "la",
            "ga",
            "xxx",
            "cf",
            "hr",
            "ng",
            "jobs",
            "online",
            "kz",
            "ug",
            "gq",
            "ae",
            "is",
            "lv",
            "pro",
            "fm",
            "tips",
            "ms",
            "sa",
            "app"));

    /**
     * Censors all supported kinds of sensitive data in the given text.
     *
     * <p>The censors run in a fixed order: UUIDs, E164 phone numbers, e-mail addresses, v2 group
     * ids, v1 group ids, domain names and finally IPv4 addresses. Each pass operates on the
     * output of the previous one.
     *
     * @param in the text to scrub
     * @return the scrubbed text; the very same instance as {@code in} if nothing matched
     */
    public static CharSequence scrub(CharSequence in) {
        in = scrubUuids(in);
        in = scrubE164(in);
        in = scrubEmail(in);
        in = scrubGroupV2Ids(in);
        in = scrubGroupV1Ids(in);
        in = scrubDomains(in);
        in = scrubIpv4(in);
        return in;
    }

    /** Replaces the middle digits of phone numbers with asterisks, keeping prefix and last two digits. */
    private static CharSequence scrubE164(CharSequence in) {
        return scrub(in,
                E164_PATTERN,
                (matcher, output) -> output.append(matcher.group(1))
                        .append(E164_CENSOR, 0, matcher.group(2).length())
                        .append(matcher.group(3)));
    }

    /** Censors the middle 18 characters of v1 group ids. */
    private static CharSequence scrubGroupV1Ids(CharSequence in) {
        return scrub(in,
                GROUP_V1_ID_PATTERN,
                (matcher, output) -> appendCensoredGroupId(matcher, output, GROUP_V1_ID_CENSOR));
    }

    /** Censors the middle 39 characters of v2 group ids. */
    private static CharSequence scrubGroupV2Ids(CharSequence in) {
        return scrub(in,
                GROUP_V2_ID_PATTERN,
                (matcher, output) -> appendCensoredGroupId(matcher, output, GROUP_V2_ID_CENSOR));
    }

    /**
     * Writes a group id match with only its middle group censored.
     *
     * <p>Group 4 (the terminator) is part of the match and therefore consumed from the input, so
     * it has to be written back explicitly; otherwise it would vanish from the scrubbed text.
     */
    private static void appendCensoredGroupId(Matcher matcher, StringBuilder output, String censor) {
        output.append(matcher.group(1))
                .append(censor, 0, matcher.group(2).length())
                .append(matcher.group(3))
                .append(matcher.group(4));
    }

    /** Keeps the first character of an e-mail address and replaces the rest with a fixed marker. */
    private static CharSequence scrubEmail(CharSequence in) {
        return scrub(in,
                CRUDE_EMAIL_PATTERN,
                (matcher, output) -> output.append(matcher.group(1)).append(EMAIL_CENSOR));
    }

    /** Censors UUIDs except for their last two characters; UUIDs prefixed with JOB:: are kept. */
    private static CharSequence scrubUuids(CharSequence in) {
        return scrub(in, UUID_PATTERN, (matcher, output) -> {
            final String jobPrefix = matcher.group(1);
            if (jobPrefix != null && !jobPrefix.isEmpty()) {
                // Prefixed UUIDs are written back unchanged.
                output.append(jobPrefix).append(matcher.group(2)).append(matcher.group(3));
            } else {
                output.append(UUID_CENSOR).append(matcher.group(3));
            }
        });
    }

    /**
     * Censors everything but the TLD of domain names with a well known TLD.
     * Names ending in whispersystems.org or signal.org are never censored.
     */
    private static CharSequence scrubDomains(CharSequence in) {
        return scrub(in, DOMAIN_PATTERN, (matcher, output) -> {
            final String match = matcher.group(0);
            final String tld = matcher.group(2);
            final boolean censor = TOP_100_TLDS.contains(tld.toLowerCase(Locale.US))
                    && !match.endsWith("whispersystems.org")
                    && !match.endsWith("signal.org");
            if (censor) {
                output.append(DOMAIN_CENSOR).append(tld);
            } else {
                output.append(match);
            }
        });
    }

    /** Replaces every IPv4 address with a fixed marker. */
    private static CharSequence scrubIpv4(CharSequence in) {
        return scrub(in, IPV4_PATTERN, (matcher, output) -> output.append(IPV4_CENSOR));
    }

    /**
     * Applies one censor to the text.
     *
     * @param in           the text to search
     * @param pattern      the pattern locating sensitive data
     * @param processMatch writes the replacement for each match to the output
     * @return {@code in} itself if the pattern never matches, otherwise a new buffer in which
     *         every match has been replaced
     */
    private static CharSequence scrub(
            CharSequence in, Pattern pattern, ProcessMatch processMatch
    ) {
        final Matcher matcher = pattern.matcher(in);
        if (!matcher.find()) {
            // there were no matches, save copying all the data
            return in;
        }

        final StringBuilder output = new StringBuilder(in.length());
        int lastEndingPos = 0;
        do {
            // Copy the untouched text between the previous match and this one.
            output.append(in, lastEndingPos, matcher.start());
            processMatch.scrubMatch(matcher, output);
            lastEndingPos = matcher.end();
        } while (matcher.find());

        output.append(in, lastEndingPos, in.length());
        return output;
    }

    /**
     * Callback producing the replacement text for a single match.
     */
    @FunctionalInterface
    private interface ProcessMatch {

        /**
         * Appends the censored form of the current match to the output.
         *
         * @param matcher the matcher, positioned on the match to process
         * @param output  the buffer receiving the replacement
         */
        void scrubMatch(Matcher matcher, StringBuilder output);
    }
}