SearchAPI Example: Search for Student IDs
The SearchAPI in this example looks for student IDs. It follows the standard two-stage model: regex captures candidates; C++ strictly validates + normalizes.
The SearchAPI example below searches for student IDs.
- The format is always "UMA" followed by six numeric digits and then the month and year issued:
  UMA + 6 digits + 2-digit month + 2-digit year
- Example: UMA5456640124, where '01' is for the month of January and '24' is for the year 2024
The content below is a drop-in SearchDLL-style SearchAPI detector for UMA student IDs in this format.
It follows the standard two-stage model:
- Regex captures candidates
- C++ strictly validates and normalizes.
Stage 1 - RegEx (GetSearchItemData)
This regex looks for UMA and then 10 digits, allowing optional whitespace between digits to survive OCR splits (e.g., UMA545664 01 24):
(?i)(?<![A-Z0-9])UMA(?:\s*\d){10}(?![A-Z0-9])
- (?i): case-insensitive
- (?<![A-Z0-9]) and (?![A-Z0-9]): reduce partial/embedded matches
- (?:\s*\d){10}: allows spaces/newlines from OCR tokenization between digits, but still requires 10 digits total
Stage 2 - C++ Strictly Validates and Normalizes
Example SearchDLL.cpp - Used to Discover UMA Student IDs
Replace RESULT_TYPE with your assigned custom range ID (whatever your console mapping expects).
// SearchDLL.cpp - UMA Student ID detector
// Detects: UMA + 6 digits + MM + YY (total: UMA + 10 digits)
// Example: UMA5456640124 -> student=545664, month=01, year=24
#include <string>
#include <vector>
#ifdef _WIN32
#include <tchar.h>
#endif
// ---- Spirion/Identity Finder SearchDLL types ----
// In your real project these come from Spirion headers.
// Keep these declarations only if your project template doesn't already define them.
// Descriptive metadata attached to a search definition.
// NOTE(review): the layout presumably has to match the vendor header — confirm before editing.
struct SearchInfo
{
const TCHAR* displayName; // name of the search; this file points it at CUSTOM_SEARCH_NAME
};
// One search definition: identifies the search and carries the pattern to look for.
// Filled in by GetSearchItemData() and released by DeleteSearchItemData().
// NOTE(review): field order/types presumably have to match the vendor header — confirm before editing.
struct SearchItemData
{
SearchInfo searchInfo; // descriptive metadata (display name)
unsigned int resultType; // result type ID; this file stores RESULT_TYPE here
const TCHAR* data; // regex/keyword
unsigned int dataType; // 1 = regex, 2 = keyword
};
// ---- Configuration ----
// Name reported by GetDisplayName() and stored in SearchItemData::searchInfo.displayName.
#define CUSTOM_SEARCH_NAME _T("UMA_StudentID_Detector")
// ID reported by GetResultType() and stored in SearchItemData::resultType.
#define RESULT_TYPE 13750 // TODO: set your custom ResultType ID (must match your console/config mapping)
// ---------------- Helpers ----------------
// True only for the ASCII decimal digits '0'..'9'; no locale or Unicode digit classes are consulted.
static inline bool IsDigit(wchar_t c)
{
    return !(c < L'0' || c > L'9');
}
// True only for the ASCII letters A-Z and a-z; accented or non-Latin letters do not count.
static inline bool IsAlpha(wchar_t c)
{
    const bool isUpper = (L'A' <= c && c <= L'Z');
    const bool isLower = (L'a' <= c && c <= L'z');
    return isUpper || isLower;
}
// True for an ASCII letter or an ASCII digit; every other character (whitespace,
// punctuation, non-ASCII) is treated as a separator by the helpers that call this.
static inline bool IsAlnum(wchar_t c)
{
    if (IsAlpha(c))
        return true;
    return IsDigit(c);
}
// Upper-cases a single ASCII letter; any other character is returned unchanged.
static inline wchar_t ToUpperChar(wchar_t c)
{
    const bool isLowerAscii = (c >= L'a' && c <= L'z');
    // 'a'..'z' and 'A'..'Z' are laid out in parallel, so re-basing from 'a' onto 'A' converts the case.
    return isLowerAscii ? static_cast<wchar_t>(c - L'a' + L'A') : c;
}
// Returns a copy of `s` with every ASCII letter upper-cased (same length, other characters untouched).
static std::wstring ToUpperCopy(const std::wstring& s)
{
    std::wstring upper;
    upper.reserve(s.size());
    for (const wchar_t ch : s)
        upper.push_back(ToUpperChar(ch));
    return upper;
}
// Strips everything except ASCII letters and digits and upper-cases what remains,
// e.g. "uma 545664-01 24" becomes "UMA5456640124".
static std::wstring NormalizeAlnumUpper(const std::wstring& s)
{
    std::wstring kept;
    kept.reserve(s.size());
    for (size_t i = 0; i < s.size(); ++i)
    {
        const wchar_t ch = s[i];
        if (!IsAlnum(ch))
            continue; // separator: drop it
        kept += ToUpperChar(ch);
    }
    return kept;
}
// Splits text into maximal runs of ASCII letters/digits; every other character acts as a separator.
// No case conversion happens here: the caller is expected to pass already upper-cased text.
// Tokens are returned in order of appearance and are never empty.
static std::vector<std::wstring> TokenizeAlnumUpper(const std::wstring& textUpper)
{
    std::vector<std::wstring> tokens;
    std::wstring pending;
    for (const wchar_t ch : textUpper)
    {
        if (IsAlnum(ch))
        {
            pending += ch;
            continue;
        }
        // Separator: close the run collected so far, if any.
        if (!pending.empty())
        {
            tokens.push_back(pending);
            pending.clear();
        }
    }
    // A run that reaches the end of the text has no trailing separator to close it.
    if (!pending.empty())
        tokens.push_back(pending);
    return tokens;
}
// True when `s` begins with `prefix` (exact, case-sensitive comparison).
// An empty prefix matches every string.
static bool StartsWith(const std::wstring& s, const std::wstring& prefix)
{
    const size_t n = prefix.size();
    if (n > s.size())
        return false; // s is too short to contain the prefix
    return std::wstring::traits_type::compare(s.data(), prefix.data(), n) == 0;
}
// Strict check of an already-normalized candidate (upper-case, letters/digits only).
// Accepts exactly 13 characters laid out as:
//   [0..2]  "UMA"
//   [3..8]  six student digits
//   [9..10] month, 01-12
//   [11..12] two-digit year (any value 00-99)
static bool IsValidUmaStudentId(const std::wstring& normalized)
{
    const size_t kPrefixLength = 3;
    const size_t kTotalLength = 13;
    const size_t kMonthOffset = 9;

    if (normalized.size() != kTotalLength || !StartsWith(normalized, L"UMA"))
        return false;

    // Everything after the prefix must be a digit.
    for (size_t pos = kPrefixLength; pos < kTotalLength; ++pos)
    {
        if (!IsDigit(normalized[pos]))
            return false;
    }

    const int month = (normalized[kMonthOffset] - L'0') * 10 + (normalized[kMonthOffset + 1] - L'0');

    // Year is 2 digits (00-99). You can tighten this if you want (e.g., >= 20).
    // int year = (normalized[11]-L'0')*10 + (normalized[12]-L'0');
    return month >= 1 && month <= 12;
}
// Extracts the canonical student ID ("UMA" + 10 digits, upper-case, no separators)
// from match text.
//
// The text is upper-cased and split into alphanumeric tokens; adjacent tokens are
// then glued back together so OCR-split forms such as "UMA 545664 01 24" or
// "UMA 5 4 5 6 6 4 0 1 2 4" are reconstructed. A candidate always starts at the
// beginning of a token and ends at the end of a token, so an ID embedded in a
// longer alphanumeric run ("XUMA5456640124", "UMA54566401245") is rejected, in
// line with the boundary checks of the stage-1 regex.
//
// matchText     - raw text to inspect (may contain surrounding words).
// outNormalized - receives the 13-character canonical ID on success; left
//                 untouched on failure.
// Returns true when a candidate that passes IsValidUmaStudentId was found.
static bool ExtractUmaCandidate(const std::wstring& matchText, std::wstring& outNormalized)
{
    const size_t kIdLength = 13; // "UMA" + 6 student digits + MM + YY

    // Tokens consist solely of upper-case ASCII letters/digits, so concatenating
    // them needs no further normalization.
    const std::vector<std::wstring> tokens = TokenizeAlnumUpper(ToUpperCopy(matchText));

    for (size_t start = 0; start < tokens.size(); ++start)
    {
        std::wstring merged;
        // No fixed cap on the number of merged tokens: every token adds at least
        // one character, so the length checks below end the loop after at most
        // kIdLength tokens (worst case: "UMA" followed by ten single digits).
        for (size_t next = start; next < tokens.size(); ++next)
        {
            merged += tokens[next];

            // With three or more characters the prefix is settled; give up on
            // this start position as soon as it cannot be "UMA".
            if (merged.size() >= 3 && !StartsWith(merged, L"UMA"))
                break;

            if (merged.size() < kIdLength)
                continue; // still too short, pull in the next token

            if (merged.size() == kIdLength && IsValidUmaStudentId(merged))
            {
                outNormalized = merged;
                return true;
            }

            // Exactly 13 characters but invalid, or already longer than an ID:
            // extending further cannot help, so try the next start position.
            break;
        }
    }
    return false;
}
// Shared yes/no decision behind DoTest / DoTestEx: true when the text contains
// a student ID that can be extracted and passes the strict format check.
static bool ValidateMatch(const std::wstring& matchText)
{
    std::wstring candidate;
    const bool extracted = ExtractUmaCandidate(matchText, candidate);
    // The extracted value is checked once more before it is reported as valid.
    return extracted && IsValidUmaStudentId(candidate);
}
// ---------------- Exported functions ----------------
// Exported: returns the name of this custom search (CUSTOM_SEARCH_NAME).
// The returned pointer refers to a string literal; nothing is allocated.
extern "C" __declspec(dllexport)
const TCHAR* GetDisplayName(void)
{
return CUSTOM_SEARCH_NAME;
}
// Exported: returns the result type ID of this custom search (RESULT_TYPE).
extern "C" __declspec(dllexport)
unsigned int GetResultType(void)
{
return RESULT_TYPE;
}
// Exported: builds the stage-1 search definition and hands it back through pData.
// The object is allocated with new; DeleteSearchItemData() is the matching release function.
extern "C" __declspec(dllexport)
void GetSearchItemData(SearchItemData*& pData)
{
    SearchItemData* item = new SearchItemData();

    item->dataType = 1; // regex

    // Regex: UMA + 10 digits, allowing optional whitespace between digits.
    // Matches: UMA5456640124, UMA 545664 01 24, UMA 5 4 5 6 6 4 0 1 2 4
    item->data = _T("(?i)(?<![A-Z0-9])UMA(?:\\s*\\d){10}(?![A-Z0-9])");

    item->resultType = RESULT_TYPE;
    item->searchInfo.displayName = CUSTOM_SEARCH_NAME;

    pData = item;
}
// Exported: releases a SearchItemData object created by GetSearchItemData().
// A null pointer is accepted. pData is passed by value, so the caller's own
// pointer variable is not reset by this call.
extern "C" __declspec(dllexport)
void DeleteSearchItemData(SearchItemData* pData)
{
    // delete on a null pointer is a no-op, so no explicit check is required.
    delete pData;
}
// The engine may call DoTest (no location) or DoTestEx (with location + file data)
// Exported stage-2 check: returns true when x contains a valid UMA student ID.
// The file data pointer is accepted but not used; the decision is made by ValidateMatch(x) alone.
extern "C" __declspec(dllexport)
bool DoTest(const std::wstring& x, const std::wstring* /*fileDataPtr*/)
{
return ValidateMatch(x);
}
// Exported stage-2 check, variant that also receives the file data and a location.
// Both extra arguments are accepted but not used; the result is ValidateMatch(x),
// i.e. identical to DoTest for the same x.
extern "C" __declspec(dllexport)
bool DoTestEx(const std::wstring& x, const std::wstring* /*fileDataPtr*/, std::wstring::size_type /*location*/)
{
return ValidateMatch(x);
}
// Exported: produces the canonical form of a match.
// x      - text to normalize.
// result - always reset to nullptr first; on success it receives a newly
//          allocated string holding the canonical value, which is released
//          with FreeCleanedResult().
// Returns true on success; false when no valid ID is found or when an
// exception (e.g. allocation failure) occurs. No exception leaves this function.
extern "C" __declspec(dllexport)
bool Clean(const std::wstring& x, std::wstring*& result)
{
    result = nullptr;
    try
    {
        std::wstring canonical;
        const bool found = ExtractUmaCandidate(x, canonical);
        if (found)
        {
            // Canonical value stored: UMA + 10 digits (no spaces)
            result = new std::wstring(canonical);
        }
        return found;
    }
    catch (...)
    {
        return false;
    }
}
// Exported: releases a string produced by Clean() and resets the caller's
// pointer to nullptr. A null pointer is accepted.
// Returns true on success, false if an exception was caught while releasing.
extern "C" __declspec(dllexport)
bool FreeCleanedResult(std::wstring*& result)
{
    try
    {
        // delete on a null pointer is a no-op, so no explicit check is required.
        delete result;
        result = nullptr;
        return true;
    }
    catch (...)
    {
        return false;
    }
}
Test strings (quick QA list)
Should match
- UMA5456640124
- uma5456640124
- UMA 545664 01 24
- Student ID: UMA5456640124
- UMA 5 4 5 6 6 4 0 1 2 4
Should NOT match
- UMA5456641324 (month 13 invalid)
- UMA5456640024 (month 00 invalid)
- UMA545660124 (too short)
- XUMA5456640124 (blocked by boundary)
- UMA54566401245 (too long)