A library that removes common unicode confusables/homoglyphs from strings.
- Its core is written in Rust and utilizes a form of Binary Search to ensure speed!
- By default, it's capable of filtering 221,529 (19.88%) different unicode codepoints like:
- All whitespace characters
- All diacritics, this also eliminates all forms of Zalgo text
- Most leetspeak characters
- Most homoglyphs
- Several emojis
- Unlike other packages, this package is Unicode bidi-aware: it interprets right-to-left characters the same way an application would when rendering them!
- Its behavior is also highly customizable to your liking!
- And it's available in the following languages:
Rust (v1.65 or later)
In your Cargo.toml
:
decancer = "3.3.3"
JavaScript (Node.js)
In your shell:
$ npm install decancer
In your code (CommonJS):
const decancer = require('decancer')
In your code (ESM):
import decancer from 'decancer'
JavaScript (Browser)
In your code:
<script type="module">
import init from 'https://cdn.jsdelivr.net/gh/null8626/decancer@v3.3.3/bindings/wasm/bin/decancer.min.js'
const decancer = await init()
</script>
Java
You can download the latest JAR file here.
In your build.gradle
:
repositories {
mavenCentral()
maven { url 'https://jitpack.io' }
}
dependencies {
implementation 'io.github.null8626:decancer:3.3.3'
}
In your pom.xml
:
<repositories>
<repository>
<id>central</id>
<url>https://repo.maven.apache.org/maven2</url>
</repository>
<repository>
<id>jitpack.io</id>
<url>https://jitpack.io</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>io.github.null8626</groupId>
<artifactId>decancer</artifactId>
<version>3.3.3</version>
</dependency>
</dependencies>
Windows:
> git clone https://github.com/null8626/decancer.git --branch v3.3.3 --depth 1
> cd decancer/bindings/java
> powershell -NoLogo -NoProfile -NonInteractive -Command "Expand-Archive -Path .\bin\bindings.zip -DestinationPath .\bin -Force"
> gradle build -x test
macOS/Linux:
$ git clone https://github.com/null8626/decancer.git --branch v3.3.3 --depth 1
$ cd decancer/bindings/java
$ unzip ./bin/bindings.zip -d ./bin
$ chmod +x ./gradlew
$ ./gradlew build -x test
Tip: You can shrink the size of the resulting JAR file by removing binaries in the bin
directory for the platforms you don't want to support.
C/C++
- Header file
- Download for ARM64 macOS (11.0+, Big Sur+)
- Download for ARM64 iOS
- Download for Apple iOS Simulator on ARM64
- Download for ARM64 Android
- Download for ARM64 Windows MSVC
- Download for ARM64 Linux (kernel 4.1, glibc 2.17+)
- Download for ARM64 Linux with MUSL
- Download for ARMv6 Linux (kernel 3.2, glibc 2.17)
- Download for ARMv5TE Linux (kernel 4.4, glibc 2.23)
- Download for ARMv7-A Android
- Download for ARMv7-A Linux (kernel 4.15, glibc 2.27)
- Download for ARMv7-A Linux, hardfloat (kernel 3.2, glibc 2.17)
- Download for 32-bit Linux w/o SSE (kernel 3.2, glibc 2.17)
- Download for 32-bit MSVC (Windows 7+)
- Download for 32-bit FreeBSD
- Download for 32-bit Linux (kernel 3.2+, glibc 2.17+)
- Download for PPC64LE Linux (kernel 3.10, glibc 2.17)
- Download for RISC-V Linux (kernel 4.20, glibc 2.29)
- Download for S390x Linux (kernel 3.2, glibc 2.17)
- Download for SPARC Solaris 11, illumos
- Download for Thumb2-mode ARMv7-A Linux with NEON (kernel 4.4, glibc 2.23)
- Download for 64-bit macOS (10.12+, Sierra+)
- Download for 64-bit iOS
- Download for 64-bit MSVC (Windows 7+)
- Download for 64-bit FreeBSD
- Download for 64-bit illumos
- Download for 64-bit Linux (kernel 3.2+, glibc 2.17+)
- Download for 64-bit Linux with MUSL
Building from source requires Rust v1.65 or later.
$ git clone https://github.com/null8626/decancer.git --branch v3.3.3 --depth 1
$ cd decancer/bindings/native
$ cargo build --release
And the binary files should be generated in the target/release
directory.
Go (v1.17 or later)
Building requires Rust v1.65 or later. Windows systems also require a MinGW compiler to be readily available.
In your shell:
$ git clone https://github.com/null8626/decancer.git --branch v3.3.3 --depth 1
$ cd decancer/bindings/go
$ sudo -E "PATH=$PATH" go generate
$ go install
For most platforms, go generate
will require elevated administrator permissions as decancer's native binding will be added to your system's libraries for convenience.
Rust
For more information, please read the documentation.
let mut cured = decancer::cure!(r"vＥⓡ𝔂 𝔽𝕌Ňℕｙ ţ乇𝕏𝓣 wWiIiIIttHh l133t5p3/-\|<").unwrap();
assert_eq!(cured, "very funny text with leetspeak");
// WARNING: it's NOT recommended to coerce this output to a Rust string
// and process it manually from there, as decancer has its own
// custom comparison measures, including leetspeak matching!
assert_ne!(cured.as_str(), "very funny text with leetspeak");
assert!(cured.contains("funny"));
cured.censor("funny", '*');
assert_eq!(cured, "very ***** text with leetspeak");
cured.censor_multiple(["very", "text"], '-');
assert_eq!(cured, "---- ***** ---- with leetspeak");
JavaScript (Node.js)
const assert = require('assert')
const cured = decancer('vＥⓡ𝔂 𝔽𝕌Ňℕｙ ţ乇𝕏𝓣 wWiIiIIttHh l133t5p3/-\\|<')
assert(cured.equals('very funny text with leetspeak'))
// WARNING: it's NOT recommended to coerce this output to a JavaScript string
// and process it manually from there, as decancer has its own
// custom comparison measures, including leetspeak matching!
assert(cured.toString() !== 'very funny text with leetspeak')
console.log(cured.toString())
// => very funny text wwiiiiitthh l133t5p3/-\|<
assert(cured.contains('funny'))
cured.censor('funny', '*')
console.log(cured.toString())
// => very ***** text wwiiiiitthh l133t5p3/-\|<
cured.censorMultiple(['very', 'text'], '-')
console.log(cured.toString())
// => ---- ***** ---- wwiiiiitthh l133t5p3/-\|<
JavaScript (Browser)
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Decancerer!!! (tm)</title>
<style>
textarea {
font-size: 30px;
}
#cure {
font-size: 20px;
padding: 5px 30px;
}
</style>
</head>
<body>
<h3>Input cancerous text here:</h3>
<textarea rows="10" cols="30"></textarea>
<br />
<button id="cure" onclick="cure()">cure!</button>
<script type="module">
import init from 'https://cdn.jsdelivr.net/gh/null8626/decancer@v3.3.3/bindings/wasm/bin/decancer.min.js'
const decancer = await init()
window.cure = function () {
const textarea = document.querySelector('textarea')
if (!textarea.value.length) {
return alert("There's no text!!!")
}
textarea.value = decancer(textarea.value).toString()
}
</script>
</body>
</html>
Java
For more information, please read the documentation.
import io.github.null8626.decancer.CuredString;
public class Program {
public static void main(String[] args) {
try (final CuredString cured = new CuredString("vＥⓡ𝔂 𝔽𝕌Ňℕｙ ţ乇𝕏𝓣 wWiIiIIttHh l133t5p3/-\\|<")) {
assert cured.equals("very funny text with leetspeak");
// WARNING: it's NOT recommended to coerce this output to a Java String
// and process it manually from there, as decancer has its own
// custom comparison measures, including leetspeak matching!
assert !cured.toString().equals("very funny text with leetspeak");
System.out.println(cured.toString());
// => very funny text wwiiiiitthh l133t5p3/-\|<
assert cured.contains("funny");
cured.censor("funny", '*');
System.out.println(cured.toString());
// => very ***** text wwiiiiitthh l133t5p3/-\|<
String[] keywords = { "very", "text" };
cured.censorMultiple(keywords, '-');
System.out.println(cured.toString());
// => ---- ***** ---- wwiiiiitthh l133t5p3/-\|<
}
}
}
C/C++
For more information, please read the documentation.
UTF-8 example:
#include <decancer.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#define decancer_assert(expr, notes) \
if (!(expr)) { \
fprintf(stderr, "assertion failure at " notes "\n"); \
ret = 1; \
goto END; \
}
int main() {
int ret = 0;
// UTF-8 bytes for "vＥⓡ𝔂 𝔽𝕌Ňℕｙ ţ乇𝕏𝓣"
uint8_t input[] = {0x76, 0xef, 0xbc, 0xa5, 0xe2, 0x93, 0xa1, 0xf0, 0x9d, 0x94, 0x82, 0x20, 0xf0, 0x9d,
0x94, 0xbd, 0xf0, 0x9d, 0x95, 0x8c, 0xc5, 0x87, 0xe2, 0x84, 0x95, 0xef, 0xbd, 0x99,
0x20, 0xc5, 0xa3, 0xe4, 0xb9, 0x87, 0xf0, 0x9d, 0x95, 0x8f, 0xf0, 0x9d, 0x93, 0xa3};
decancer_error_t error;
decancer_cured_t cured = decancer_cure(input, sizeof(input), DECANCER_OPTION_DEFAULT, &error);
if (cured == NULL) {
fprintf(stderr, "curing error: %.*s\n", (int)error.message_length, error.message);
return 1;
}
decancer_assert(decancer_contains(cured, "funny", 5), "decancer_contains");
END:
decancer_cured_free(cured);
return ret;
}
UTF-16 example:
#include <decancer.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#define decancer_assert(expr, notes) \
if (!(expr)) { \
fprintf(stderr, "assertion failure at " notes "\n"); \
ret = 1; \
goto END; \
}
int main() {
int ret = 0;
// UTF-16 code units for "vＥⓡ𝔂 𝔽𝕌Ňℕｙ ţ乇𝕏𝓣"
uint16_t input[] = {
0x0076, 0xff25, 0x24e1,
0xd835, 0xdd02, 0x0020,
0xd835, 0xdd3d, 0xd835,
0xdd4c, 0x0147, 0x2115,
0xff59, 0x0020, 0x0163,
0x4e47, 0xd835, 0xdd4f,
0xd835, 0xdce3
};
// UTF-16 code units for "funny"
uint16_t funny[] = { 0x66, 0x75, 0x6e, 0x6e, 0x79 };
decancer_error_t error;
decancer_cured_t cured = decancer_cure_utf16(input, sizeof(input) / sizeof(uint16_t), DECANCER_OPTION_DEFAULT, &error);
if (cured == NULL) {
fprintf(stderr, "curing error: %.*s\n", (int)error.message_length, error.message);
return 1;
}
decancer_assert(decancer_contains_utf16(cured, funny, sizeof(funny) / sizeof(uint16_t)), "decancer_contains_utf16");
END:
decancer_cured_free(cured);
return ret;
}
Go
package main
import (
"os"
"fmt"
"strconv"
"github.com/null8626/decancer/bindings/go"
)
func main() {
cured, err := decancer.Cure("vＥⓡ𝔂 𝔽𝕌Ňℕｙ ţ乇𝕏𝓣", decancer.Default)
if err != nil {
fmt.Fprintln(os.Stderr, "error:", err)
os.Exit(1)
}
defer cured.Close()
fmt.Println(cured.String())
if cured.Equals("very funny text") {
fmt.Println("it is indeed a very funny text")
}
if cured.StartsWith("very") {
fmt.Println("it starts with 'very'")
}
if cured.EndsWith("text") {
fmt.Println("it ends with 'text'")
}
if cured.Contains("funny") {
fmt.Println("it has the funny")
}
funnyMatches := cured.Find("funny")
fmt.Println("funny counter:")
for i, match := range funnyMatches {
fmt.Println("Match " + strconv.Itoa(i) + ":")
fmt.Println(" - start: " + strconv.Itoa(match.Start))
fmt.Println(" - end: " + strconv.Itoa(match.End))
}
keywords := []string{"very", "funny"}
veryFunnyMatches, err := cured.FindMultiple(keywords)
if err != nil {
fmt.Fprintln(os.Stderr, "error:", err)
os.Exit(1)
}
fmt.Println("very funny counter:")
for i, match := range veryFunnyMatches {
fmt.Println("Match " + strconv.Itoa(i) + ":")
fmt.Println(" - start: " + strconv.Itoa(match.Start))
fmt.Println(" - end: " + strconv.Itoa(match.End))
}
}
If you want to support my eyes for manually looking at thousands of Unicode characters, consider donating! ❤
Want to contribute? Please read CONTRIBUTING.md
first, especially if you are a new contributor!