SearchAPI Example: Search for Student IDs

The SearchAPI in this example looks for student IDs. It follows the standard two-stage model: a regex captures candidates, and then C++ code strictly validates and normalizes them.

The student IDs that this example searches for use the following format:

  • The format is always "UMA" followed by six numeric digits and then the month and year issued.
    • UMA + 6 digits + 2-digit month + 2-digit year
    • Example: UMA5456640124, where '01' is for the month of January and '24' is for the year 2024

The content below is a drop-in SearchDLL-style SearchAPI detector for UMA student IDs in this format.

It follows the standard two-stage model:

  1. Regex captures candidates
  2. C++ strictly validates and normalizes.


Stage 1 - RegEx (GetSearchItemData)

This regex looks for UMA and then 10 digits, allowing optional whitespace between digits to survive OCR splits (e.g., UMA545664 01 24):

(?i)(?<![A-Z0-9])UMA(?:\s*\d){10}(?![A-Z0-9])
  • (?i) case-insensitive
  • (?<![A-Z0-9]) and (?![A-Z0-9]) reduce partial/embedded matches
  • (?:\s*\d){10} allows whitespace (spaces or line breaks introduced by OCR tokenization) before each digit, but still requires exactly 10 digits in total


Stage 2 - C++ Strictly Validates and Normalizes

Example SearchDLL.cpp - Used to Discover UMA Student IDs

Replace RESULT_TYPE with your assigned custom range ID (whatever your console mapping expects).

// SearchDLL.cpp - UMA Student ID detector
// Detects: UMA + 6 digits + MM + YY (total: UMA + 10 digits)
// Example: UMA5456640124 -> student=545664, month=01, year=24

#include <string>
#include <vector>

#ifdef _WIN32
#include <tchar.h>
#endif

// ---- Spirion/Identity Finder SearchDLL types ----
// In your real project these come from Spirion headers.
// Keep these declarations only if your project template doesn't already define them.
// Descriptive metadata for a custom search.
// NOTE(review): this mirrors a type that normally comes from the vendor headers;
// presumably the layout must match theirs - confirm before changing it.
struct SearchInfo
{
const TCHAR* displayName; // name of the search; this file sets it to CUSTOM_SEARCH_NAME
};

// One search definition, filled in by GetSearchItemData in this file.
// NOTE(review): this mirrors a type that normally comes from the vendor headers;
// presumably field order and types must match theirs - confirm before changing it.
struct SearchItemData
{
SearchInfo searchInfo; // display metadata for the search
unsigned int resultType; // result type ID; this file sets it to RESULT_TYPE
const TCHAR* data; // regex/keyword text, interpreted according to dataType
unsigned int dataType; // 1 = regex, 2 = keyword
};

// ---- Configuration ----
// Name returned by GetDisplayName and stored in SearchItemData::searchInfo.displayName.
#define CUSTOM_SEARCH_NAME _T("UMA_StudentID_Detector")
// Result type ID returned by GetResultType and stored in SearchItemData::resultType.
#define RESULT_TYPE 13750 // TODO: set your custom ResultType ID (must match your console/config mapping)

// ---------------- Helpers ----------------
// ASCII-only character classification. These deliberately avoid the locale-aware
// <cwctype> functions so that only '0'-'9', 'A'-'Z' and 'a'-'z' are ever accepted.

// True when c is one of the ASCII digits '0'..'9'.
static inline bool IsDigit(wchar_t c)
{
    return L'0' <= c && c <= L'9';
}

// True when c is an ASCII letter, upper or lower case.
static inline bool IsAlpha(wchar_t c)
{
    const bool isUpper = (L'A' <= c && c <= L'Z');
    const bool isLower = (L'a' <= c && c <= L'z');
    return isUpper || isLower;
}

// True when c is an ASCII letter or an ASCII digit.
static inline bool IsAlnum(wchar_t c)
{
    if (IsDigit(c))
        return true;
    return IsAlpha(c);
}

// Maps an ASCII lowercase letter to its uppercase form; every other character
// (including non-ASCII letters) is returned unchanged.
static inline wchar_t ToUpperChar(wchar_t c)
{
    const bool isLowerAscii = (L'a' <= c) && (c <= L'z');
    return isLowerAscii ? static_cast<wchar_t>(c - L'a' + L'A') : c;
}

// Returns a copy of s in which every ASCII lowercase letter has been uppercased
// via ToUpperChar. The length and all other characters are preserved.
static std::wstring ToUpperCopy(const std::wstring& s)
{
    std::wstring upper;
    upper.reserve(s.size());
    for (const wchar_t ch : s)
        upper.push_back(ToUpperChar(ch));
    return upper;
}

// Strips everything except ASCII letters and digits from s and uppercases the
// letters that remain. Example: "uma 5456-64" -> "UMA545664".
static std::wstring NormalizeAlnumUpper(const std::wstring& s)
{
    std::wstring kept;
    kept.reserve(s.size());
    for (size_t i = 0; i < s.size(); ++i)
    {
        const wchar_t ch = s[i];
        if (!IsAlnum(ch))
            continue; // separator / punctuation: drop it
        kept.push_back(ToUpperChar(ch));
    }
    return kept;
}

// Splits textUpper into its maximal runs of ASCII letters/digits, in order of
// appearance. Any other character acts as a separator and is discarded.
// The text is not case-converted here; callers pass already-uppercased input.
static std::vector<std::wstring> TokenizeAlnumUpper(const std::wstring& textUpper)
{
    std::vector<std::wstring> tokens;
    const size_t len = textUpper.size();

    size_t pos = 0;
    while (pos < len)
    {
        // Skip the separators in front of the next token.
        while (pos < len && !IsAlnum(textUpper[pos]))
            ++pos;

        // Consume one run of alphanumerics.
        const size_t tokenStart = pos;
        while (pos < len && IsAlnum(textUpper[pos]))
            ++pos;

        if (pos > tokenStart)
            tokens.push_back(textUpper.substr(tokenStart, pos - tokenStart));
    }
    return tokens;
}

// True when s begins with prefix (exact, case-sensitive comparison).
// An empty prefix matches every string.
static bool StartsWith(const std::wstring& s, const std::wstring& prefix)
{
    if (prefix.size() > s.size())
        return false;

    for (size_t i = 0; i < prefix.size(); ++i)
    {
        if (s[i] != prefix[i])
            return false;
    }
    return true;
}

// Checks that `normalized` is exactly "UMA" followed by 10 digits whose month
// field is in the range 01-12.
// Layout: UMA [0..2], student digits [3..8], month [9..10], year [11..12]
// The input is expected to be uppercase alphanumerics only; a lowercase "uma"
// prefix is rejected.
static bool IsValidUmaStudentId(const std::wstring& normalized)
{
    const size_t kPrefixLen = 3;  // "UMA"
    const size_t kTotalLen = 13;  // prefix + 10 digits
    const size_t kMonthPos = 9;   // first of the two month digits

    const bool shapeOk = (normalized.size() == kTotalLen) && StartsWith(normalized, L"UMA");
    if (!shapeOk)
        return false;

    // Everything after the prefix must be a digit.
    size_t idx = kPrefixLen;
    while (idx < kTotalLen && IsDigit(normalized[idx]))
        ++idx;
    if (idx != kTotalLen)
        return false;

    const int month = (normalized[kMonthPos] - L'0') * 10 + (normalized[kMonthPos + 1] - L'0');

    // Year is 2 digits (00-99) and is accepted as-is. You can tighten this if you
    // want (e.g., >= 20):
    // int year = (normalized[11]-L'0')*10 + (normalized[12]-L'0');

    return month >= 1 && month <= 12;
}

// Extracts the first valid UMA student ID from matchText.
//
// The text is uppercased and split into runs of letters/digits; starting at each
// run in turn, adjacent runs are glued together until the result is 13 characters
// long. This reconstructs IDs that OCR has split apart, from "UMA 545664 01 24"
// all the way to "UMA 5 4 5 6 6 4 0 1 2 4".
//
// An ID is only accepted when it begins at the start of a run and ends at the end
// of a run. An ID embedded in a longer alphanumeric sequence ("XUMA5456640124",
// "UMA54566401245") is rejected, mirroring the boundary assertions of the Stage 1
// regex.
//
// matchText      text to examine (typically the regex match).
// outNormalized  receives the canonical "UMA" + 10 digits on success; left
//                untouched on failure.
// Returns true when a valid ID was found.
static bool ExtractUmaCandidate(const std::wstring& matchText, std::wstring& outNormalized)
{
    const size_t kPrefixLen = 3;  // "UMA"
    const size_t kIdLen = 13;     // "UMA" + 10 digits

    // Cheap reject: with fewer than 13 letters/digits in total no ID can be built.
    if (NormalizeAlnumUpper(matchText).size() < kIdLen)
        return false;

    // Tokens contain only uppercase ASCII letters and digits, so concatenating
    // them needs no further normalization.
    const std::vector<std::wstring> toks = TokenizeAlnumUpper(ToUpperCopy(matchText));

    for (size_t start = 0; start < toks.size(); ++start)
    {
        std::wstring merged;

        // Every token is non-empty, so this loop runs at most kIdLen times.
        for (size_t next = start; next < toks.size(); ++next)
        {
            merged += toks[next];

            // Once three characters are known, the prefix must be "UMA".
            if (merged.size() >= kPrefixLen && !StartsWith(merged, L"UMA"))
                break;

            // Overshot: the last run carries extra letters/digits, so the ID
            // would not end on a boundary.
            if (merged.size() > kIdLen)
                break;

            if (merged.size() == kIdLen)
            {
                if (IsValidUmaStudentId(merged))
                {
                    outNormalized = merged;
                    return true;
                }
                // Right length but invalid (e.g. month 13): extending can only
                // make it longer, so move on to the next starting token.
                break;
            }
        }
    }

    return false;
}

// Central validation used by DoTest / DoTestEx.
// Succeeds when a UMA student ID can be extracted from matchText and that ID
// passes IsValidUmaStudentId.
static bool ValidateMatch(const std::wstring& matchText)
{
    std::wstring candidate;
    return ExtractUmaCandidate(matchText, candidate) && IsValidUmaStudentId(candidate);
}

// ---------------- Exported functions ----------------
// Exported: returns the name of this custom search (CUSTOM_SEARCH_NAME).
extern "C" __declspec(dllexport)
const TCHAR* GetDisplayName(void)
{
    const TCHAR* const searchName = CUSTOM_SEARCH_NAME;
    return searchName;
}

// Exported: returns the result type ID configured for this search (RESULT_TYPE).
extern "C" __declspec(dllexport)
unsigned int GetResultType(void)
{
    const unsigned int resultTypeId = RESULT_TYPE;
    return resultTypeId;
}

// Exported: allocates a SearchItemData describing this search and hands it back
// through pData. The object is created with new; presumably the engine returns
// it to DeleteSearchItemData when it is done - confirm against the SDK.
extern "C" __declspec(dllexport)
void GetSearchItemData(SearchItemData*& pData)
{
    SearchItemData* const item = new SearchItemData();

    // Stage 1 pattern: UMA + 10 digits, allowing optional whitespace before each digit.
    // Matches: UMA5456640124, UMA 545664 01 24, UMA 5 4 5 6 6 4 0 1 2 4
    item->dataType = 1; // regex
    item->data = _T("(?i)(?<![A-Z0-9])UMA(?:\\s*\\d){10}(?![A-Z0-9])");

    item->resultType = RESULT_TYPE;
    item->searchInfo.displayName = CUSTOM_SEARCH_NAME;

    pData = item;
}

// Exported: releases a SearchItemData that was allocated with new (as
// GetSearchItemData does). Passing a null pointer is allowed and does nothing.
//
// pData is received by value, so the caller's own pointer is NOT reset here;
// the caller must not use it after this call.
extern "C" __declspec(dllexport)
void DeleteSearchItemData(SearchItemData* pData)
{
    // delete on a null pointer is a no-op, so no explicit guard is required.
    delete pData;
}

// The engine may call DoTest (no location) or DoTestEx (with location + file data).
// This is the no-location form: x is the candidate text, and the file data
// pointer is ignored. Returns true when x contains a valid UMA student ID.
extern "C" __declspec(dllexport)
bool DoTest(const std::wstring& x, const std::wstring* /*fileDataPtr*/)
{
    const bool isStudentId = ValidateMatch(x);
    return isStudentId;
}

// Exported: location-aware variant of the match test. The file data pointer and
// the location are both ignored; only the candidate text x is examined.
// Returns true when x contains a valid UMA student ID.
extern "C" __declspec(dllexport)
bool DoTestEx(const std::wstring& x, const std::wstring* /*fileDataPtr*/, std::wstring::size_type /*location*/)
{
    const bool isStudentId = ValidateMatch(x);
    return isStudentId;
}

// Exported: produces the canonical form of a match.
//
// On success, result points to a newly allocated string holding "UMA" + 10
// digits (no spaces) and true is returned; that string is released with
// FreeCleanedResult. On failure (no valid ID in x, or any exception, e.g. an
// allocation failure), result is null and false is returned. No exception
// leaves this function.
extern "C" __declspec(dllexport)
bool Clean(const std::wstring& x, std::wstring*& result)
{
    result = nullptr;
    bool cleaned = false;
    try
    {
        std::wstring canonical;
        if (ExtractUmaCandidate(x, canonical))
        {
            // Canonical value stored: UMA + 10 digits (no spaces)
            result = new std::wstring(canonical);
            cleaned = true;
        }
    }
    catch (...)
    {
        cleaned = false;
    }
    return cleaned;
}

// Exported: releases a string allocated with new (as Clean does) and resets the
// caller's pointer to null. A null pointer is accepted and left as-is.
// Returns true on success; false only if an exception was caught.
extern "C" __declspec(dllexport)
bool FreeCleanedResult(std::wstring*& result)
{
    try
    {
        if (result != nullptr)
        {
            std::wstring* const owned = result;
            delete owned;
            result = nullptr;
        }
    }
    catch (...)
    {
        return false;
    }
    return true;
}


Test strings (quick QA list)

Should match

  • UMA5456640124
  • uma5456640124
  • UMA 545664 01 24
  • Student ID: UMA5456640124
  • UMA 5 4 5 6 6 4 0 1 2 4

Should NOT match

  • UMA5456641324 (month 13 invalid)
  • UMA5456640024 (month 00 invalid)
  • UMA545660124 (too short)
  • XUMA5456640124 (blocked by boundary)
  • UMA54566401245 (too long)


Was this article helpful?