Detect duplicate or near-duplicate content using SimHash and Jaccard similarity
Detect duplicate and near-duplicate content across documents, articles, and data records. Content deduplication uses SimHash and similarity scoring to identify copies, paraphrases, and overlapping content — essential for content moderation platforms, data pipelines, and publishing workflows that need to maintain originality.
Include your API key in the X-API-Key header with every request.
All requests go through the API gateway which handles authentication, rate limiting, and usage tracking.
{
"documents": [
{
"id": "doc1",
"text": "The quick brown fox jumps over the lazy dog"
},
{
"id": "doc2",
"text": "The fast brown fox leaps over the lazy dog"
}
],
"threshold": 0.8
}
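| Field | Type | Description |
|---|---|---|
| documents | array | Array of document objects to compare, each with an `id` and a `text` field |
| threshold | number | Similarity threshold used for duplicate detection (e.g. 0.8) |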
{
"duplicates": [
{
"doc1": "doc1",
"doc2": "doc2",
"similarity": 0.87,
"method": "simhash"
}
],
"totalDocuments": 2,
"duplicateCount": 1
}
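| Field | Type | Description |
|---|---|---|
| duplicates | array | Array of detected duplicate pairs, each with `doc1`, `doc2`, `similarity`, and `method` |
| totalDocuments | integer | Total number of documents submitted in the request |
| duplicateCount | integer | Number of duplicate pairs found |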
| Status | Meaning |
|---|---|
| 200 | Request completed successfully |
| 400 | Bad request — invalid or missing parameters |
| 401 | Missing or invalid X-API-Key header |
| 429 | Rate limit exceeded — check Retry-After header |
| 500 | Internal server error |
400 Empty dataset
Request that triggers this:
{"items": []}
Error response:
{"type": "/problems/validation-error", "title": "Empty Dataset", "status": 400, "detail": "Items list cannot be empty"}
How to fix: Provide at least one item in the items array for deduplication.
413 Dataset too large
Request that triggers this:
{"items": "[1 million items]"}
Error response:
{"type": "/problems/payload-too-large", "title": "Payload Too Large", "status": 413, "detail": "Maximum 10000 items per request"}
How to fix: Split large datasets into multiple requests with max 10000 items per request.
# POST two sample documents to the deduplication endpoint with a 0.8 threshold.
curl --request POST \
  --header "Content-Type: application/json" \
  --header "X-API-Key: YOUR_API_KEY" \
  --data '{
"documents": [
{
"id": "doc1",
"text": "The quick brown fox jumps over the lazy dog"
},
{
"id": "doc2",
"text": "The fast brown fox leaps over the lazy dog"
}
],
"threshold": 0.8
}' \
  /v1/content/dedup
// Node.js (18+) or modern browser
// Sends two sample documents to the deduplication endpoint and logs the HTTP
// status together with the parsed JSON body.
// NOTE: Node.js fetch only accepts absolute URLs, so prepend your API base URL
// to the path below when running outside a browser.
const payload = {
  documents: [
    { id: "doc1", text: "The quick brown fox jumps over the lazy dog" },
    { id: "doc2", text: "The fast brown fox leaps over the lazy dog" },
  ],
  threshold: 0.8,
};

const requestOptions = {
  method: "POST",
  headers: {
    "X-API-Key": "YOUR_API_KEY",
    "Content-Type": "application/json",
  },
  body: JSON.stringify(payload),
};

const response = await fetch("/v1/content/dedup", requestOptions);
const data = await response.json();
console.log(response.status, data);
import requests
# POST two sample documents to the deduplication endpoint, then print the
# HTTP status code followed by the decoded JSON response.
endpoint = "/v1/content/dedup"

headers = {
    "X-API-Key": "YOUR_API_KEY",
    "Content-Type": "application/json",
}

payload = {
    "documents": [
        {"id": "doc1", "text": "The quick brown fox jumps over the lazy dog"},
        {"id": "doc2", "text": "The fast brown fox leaps over the lazy dog"},
    ],
    "threshold": 0.8,
}

response = requests.post(endpoint, headers=headers, json=payload)
print(response.status_code)
print(response.json())
package main
import (
"fmt"
"io"
"net/http"
"strings"
)
// main posts two sample documents to the content deduplication endpoint and
// prints the HTTP status code followed by the raw response body.
// Any failure while building the request, sending it, or reading the
// response aborts the program with a panic instead of being ignored.
func main() {
	// Raw JSON request body: the documents to compare plus the threshold.
	body := strings.NewReader(`{
"documents": [
{
"id": "doc1",
"text": "The quick brown fox jumps over the lazy dog"
},
{
"id": "doc2",
"text": "The fast brown fox leaps over the lazy dog"
}
],
"threshold": 0.8
}`)

	// NOTE: the URL below is only a path; the HTTP client needs an absolute
	// URL, so prefix it with your API base URL before running.
	req, err := http.NewRequest(http.MethodPost, "/v1/content/dedup", body)
	if err != nil {
		panic(err)
	}
	req.Header.Set("X-API-Key", "YOUR_API_KEY")
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// A failed read would otherwise print a truncated body as if it were complete.
	data, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(resp.StatusCode)
	fmt.Println(string(data))
}
{
"name": "deduplication",
"description": "Detect duplicate or near-duplicate content using SimHash and Jaccard similarity",
"inputSchema": {
"type": "object",
"properties": {
"api_key": {"type": "string", "description": "Your Orovai API key"},
"request": {"type": "object", "description": "Request body"}
},
"required": ["api_key", "request"]
},
"endpoint": "/v1/content/dedup",
"method": "POST",
"headers": {
"X-API-Key": "{{api_key}}",
"Content-Type": "application/json"
}
}