SearchAPI Example: Search for Student IDs

The SearchAPI in this example looks for student IDs. It follows the standard two-stage model: regex captures candidates; C++ strictly validates + normalizes.

The SearchAPI example below searches for student IDs.

The format is always "UMA" followed by six numeric digits and then the month and year issued.
- UMA + 6 digits + 2-digit month + 2-digit year
- Example: UMA5456640124, where '01' is for the month of January and '24' is for the year 2024

The content below is a drop-in SearchDLL-style SearchAPI detector for UMA student IDs in this format.

It follows the standard two-stage model:

1. Regex captures candidates.
2. C++ strictly validates and normalizes.

Stage 1 - RegEx (GetSearchItemData)

This regex looks for UMA followed by exactly 10 digits, allowing optional whitespace before each digit so that IDs split apart by OCR still match (e.g., UMA545664 01 24):

(?i)(?<![A-Z0-9])UMA(?:\s*\d){10}(?![A-Z0-9])

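(?i) makes the match case-insensitive
(?<![A-Z0-9]) and (?![A-Z0-9]) require that the ID is not immediately preceded or followed by a letter or digit, which rejects partial/embedded matches
(?:\s*\d){10} allows optional whitespace (spaces, tabs, or line breaks introduced by OCR tokenization) before each digit, but still requires exactly 10 digits in total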

Stage 2 - C++ Strictly Validates and Normalizes

Example SearchDLL.cpp - Used to Discover UMA Student IDs

Replace RESULT_TYPE with your assigned custom range ID (whatever your console mapping expects).

// SearchDLL.cpp - UMA Student ID detector
// Detects: UMA + 6 digits + MM + YY  (total: UMA + 10 digits)
// Example: UMA5456640124  -> student=545664, month=01, year=24

#include <string>
#include <vector>

#ifdef _WIN32
#include <tchar.h>
#endif

// ---- Spirion/Identity Finder SearchDLL types ----
// In your real project these come from Spirion headers.
// Keep these declarations only if your project template doesn't already define them.
// Descriptive metadata for a custom search. This file only ever sets the display name.
struct SearchInfo
{
    const TCHAR* displayName;  // name string for the search (set to CUSTOM_SEARCH_NAME in this file)
};

// One search definition as filled in by GetSearchItemData: display metadata,
// a result type ID, and the pattern text together with a tag saying how to read it.
struct SearchItemData
{
    SearchInfo    searchInfo;  // display metadata for this search
    unsigned int  resultType;  // result type ID (set to RESULT_TYPE in this file)
    const TCHAR*  data;      // regex/keyword
    unsigned int  dataType;  // 1 = regex, 2 = keyword
};

// ---- Configuration ----
// Name returned by GetDisplayName and stored in SearchItemData::searchInfo.displayName.
#define CUSTOM_SEARCH_NAME _T("UMA_StudentID_Detector")
// ID returned by GetResultType and stored in SearchItemData::resultType.
#define RESULT_TYPE 13750  // TODO: set your custom ResultType ID (must match your console/config mapping)

// ---------------- Helpers ----------------
// ASCII-only character classification. Only the ranges '0'-'9', 'A'-'Z' and 'a'-'z'
// are recognised; no locale or Unicode category is consulted.
static inline bool IsDigit(wchar_t c)
{
    return L'0' <= c && c <= L'9';
}

static inline bool IsAlpha(wchar_t c)
{
    const bool upperCase = (L'A' <= c && c <= L'Z');
    const bool lowerCase = (L'a' <= c && c <= L'z');
    return upperCase || lowerCase;
}

static inline bool IsAlnum(wchar_t c)
{
    return IsAlpha(c) || IsDigit(c);
}

// Upper-case a single ASCII letter ('a'-'z'); every other character is returned unchanged.
static inline wchar_t ToUpperChar(wchar_t c)
{
    const bool isLowerAscii = (L'a' <= c && c <= L'z');
    return isLowerAscii ? static_cast<wchar_t>(c - L'a' + L'A') : c;
}

// Return a copy of `s` in which every character has been passed through ToUpperChar.
// The input string is not modified.
static std::wstring ToUpperCopy(const std::wstring& s)
{
    std::wstring result;
    result.reserve(s.size());
    for (const wchar_t ch : s)
    {
        result.push_back(ToUpperChar(ch));
    }
    return result;
}

// Drop every character that IsAlnum rejects and upper-case the ones that remain,
// preserving their original order.
static std::wstring NormalizeAlnumUpper(const std::wstring& s)
{
    std::wstring kept;
    kept.reserve(s.size());
    for (std::wstring::size_type idx = 0; idx < s.size(); ++idx)
    {
        const wchar_t ch = s[idx];
        if (!IsAlnum(ch))
            continue;
        kept += ToUpperChar(ch);
    }
    return kept;
}

// Split text into maximal runs of alphanumeric characters (per IsAlnum).
// Every other character acts as a separator and is discarded; empty tokens are never produced.
// No case conversion happens here: as the parameter name indicates, the input is
// expected to be upper-cased already.
static std::vector<std::wstring> TokenizeAlnumUpper(const std::wstring& textUpper)
{
    std::vector<std::wstring> tokens;
    const size_t len = textUpper.size();
    size_t pos = 0;

    while (pos < len)
    {
        // Skip the separator run in front of the next token.
        while (pos < len && !IsAlnum(textUpper[pos]))
            ++pos;

        // Consume the token itself.
        const size_t start = pos;
        while (pos < len && IsAlnum(textUpper[pos]))
            ++pos;

        if (pos > start)
            tokens.push_back(textUpper.substr(start, pos - start));
    }
    return tokens;
}

// True when `s` begins with `prefix`. The comparison is exact (case-sensitive),
// and an empty prefix matches any string.
static bool StartsWith(const std::wstring& s, const std::wstring& prefix)
{
    if (prefix.size() > s.size())
        return false;

    for (size_t i = 0; i < prefix.size(); ++i)
    {
        if (s[i] != prefix[i])
            return false;
    }
    return true;
}

// Strict check of a normalized ID: exactly "UMA" followed by 10 ASCII digits,
// where the two digits at indexes 9-10 form a month in the range 01-12.
// Index map: [0..2] "UMA", [3..8] student digits, [9..10] month, [11..12] year.
// The prefix comparison is case-sensitive, so the input is expected to be
// upper-case alphanumeric only; lowercase "uma" is rejected.
static bool IsValidUmaStudentId(const std::wstring& normalized)
{
    const size_t kPrefixLen = 3;
    const size_t kTotalLen = 13;

    const bool shapeOk = normalized.size() == kTotalLen && StartsWith(normalized, L"UMA");
    if (!shapeOk)
        return false;

    // Everything after the prefix must be a digit.
    size_t idx = kPrefixLen;
    while (idx < kTotalLen && IsDigit(normalized[idx]))
        ++idx;
    if (idx != kTotalLen)
        return false;

    const int month = (normalized[9] - L'0') * 10 + (normalized[10] - L'0');

    // The 2-digit year (indexes 11-12) is accepted as-is (00-99).
    // Add a range check here if you want to tighten it (e.g., >= 20).
    return month >= 1 && month <= 12;
}

// Extract a canonical UMA student ID ("UMA" + 10 digits, no separators) from match text.
//
// The text is upper-cased and split into alphanumeric tokens. Starting from each token
// in turn, adjacent tokens are concatenated until the run reaches 13 characters, which
// reconstructs IDs that OCR split apart ("UMA 545664 01 24", "UMA 5 4 5 6 6 4 0 1 2 4").
//
// A candidate is only accepted when it begins at the start of a token and ends at the
// end of a token. IDs embedded in a longer alphanumeric run ("XUMA5456640124",
// "UMA54566401245") are therefore rejected here as well, instead of relying solely on
// the stage-1 regex boundaries.
//
// Parameters:
//   matchText     - text to inspect (typically the stage-1 regex match).
//   outNormalized - receives the 13-character ID on success; left untouched on failure.
// Returns true when a valid ID was found, false otherwise.
static bool ExtractUmaCandidate(const std::wstring& matchText, std::wstring& outNormalized)
{
    const size_t kIdLength = 13;  // "UMA" + 6 student digits + MM + YY

    // Tokens contain only upper-case letters and digits, so a concatenation of tokens
    // is already in normalized form and needs no further clean-up.
    const std::vector<std::wstring> toks = TokenizeAlnumUpper(ToUpperCopy(matchText));

    for (size_t first = 0; first < toks.size(); ++first)
    {
        std::wstring merged;
        merged.reserve(kIdLength);

        for (size_t next = first; next < toks.size(); ++next)
        {
            merged += toks[next];

            // Once three characters are available the prefix is decided; a run that
            // does not start with UMA can never become a valid ID.
            if (merged.size() >= 3 && !StartsWith(merged, L"UMA"))
                break;

            // Still too short: keep appending tokens.
            if (merged.size() < kIdLength)
                continue;

            if (merged.size() == kIdLength && IsValidUmaStudentId(merged))
            {
                outNormalized = merged;
                return true;
            }

            // Exactly 13 but invalid, or the last token overshot 13 characters:
            // extending further cannot help, so move on to the next start token.
            break;
        }
    }

    return false;
}

// Shared accept/reject decision behind DoTest and DoTestEx: a match is accepted when a
// candidate ID can be extracted from it and that candidate passes IsValidUmaStudentId.
static bool ValidateMatch(const std::wstring& matchText)
{
    std::wstring candidate;
    return ExtractUmaCandidate(matchText, candidate) && IsValidUmaStudentId(candidate);
}

// ---------------- Exported functions ----------------
// Exported: returns the display name string of this custom search (CUSTOM_SEARCH_NAME).
extern "C" __declspec(dllexport)
const TCHAR* GetDisplayName(void)
{
    const TCHAR* const name = CUSTOM_SEARCH_NAME;
    return name;
}

// Exported: returns the result type ID configured for this search (RESULT_TYPE).
extern "C" __declspec(dllexport)
unsigned int GetResultType(void)
{
    const unsigned int resultTypeId = RESULT_TYPE;
    return resultTypeId;
}

// Exported: allocates a SearchItemData describing this search and returns it through pData.
// The object is created with `new`; DeleteSearchItemData in this file is the matching release.
extern "C" __declspec(dllexport)
void GetSearchItemData(SearchItemData*& pData)
{
    SearchItemData* item = new SearchItemData();

    // Stage-1 pattern: UMA + 10 digits, with optional whitespace before each digit.
    // Matches: UMA5456640124, UMA 545664 01 24, UMA 5 4 5 6 6 4 0 1 2 4
    item->dataType = 1; // regex
    item->data = _T("(?i)(?<![A-Z0-9])UMA(?:\\s*\\d){10}(?![A-Z0-9])");

    item->resultType = RESULT_TYPE;
    item->searchInfo.displayName = CUSTOM_SEARCH_NAME;

    pData = item;
}

// Exported: releases a SearchItemData previously allocated with `new` (see GetSearchItemData).
// The pointer is received by value, so the caller's own pointer variable is not reset.
extern "C" __declspec(dllexport)
void DeleteSearchItemData(SearchItemData* pData)
{
    // Deleting a null pointer is a no-op, so no explicit null check is required.
    delete pData;
}

// The engine may call DoTest (no location) or DoTestEx (with location + file data).
// DoTest: accept or reject the matched text `x` via ValidateMatch; the file-data
// pointer is ignored.
extern "C" __declspec(dllexport)
bool DoTest(const std::wstring& x, const std::wstring* /*fileDataPtr*/)
{
    const bool accepted = ValidateMatch(x);
    return accepted;
}

// DoTestEx: same decision as DoTest. The matched text `x` is checked with ValidateMatch;
// the file-data pointer and the match location are both ignored.
extern "C" __declspec(dllexport)
bool DoTestEx(const std::wstring& x, const std::wstring* /*fileDataPtr*/, std::wstring::size_type /*location*/)
{
    const bool accepted = ValidateMatch(x);
    return accepted;
}

// Exported: produce the canonical form of a match.
// On success, `result` points to a newly allocated string holding UMA + 10 digits with
// no separators, and true is returned; FreeCleanedResult in this file releases it.
// On failure (no valid ID in `x`, or any exception), `result` is null and false is returned.
extern "C" __declspec(dllexport)
bool Clean(const std::wstring& x, std::wstring*& result)
{
    result = nullptr;
    try
    {
        std::wstring canonical;
        if (ExtractUmaCandidate(x, canonical))
        {
            // Canonical value stored: UMA + 10 digits (no spaces)
            result = new std::wstring(canonical);
            return true;
        }
        return false;
    }
    catch (...)
    {
        return false;
    }
}

// Exported: frees a string allocated with `new` (as Clean does) and resets the caller's
// pointer to null. A null `result` is accepted and treated as already freed.
// Returns true on success, false if an exception was caught while freeing.
extern "C" __declspec(dllexport)
bool FreeCleanedResult(std::wstring*& result)
{
    try
    {
        if (result == nullptr)
            return true;

        delete result;
        result = nullptr;
        return true;
    }
    catch (...)
    {
        return false;
    }
}

Test strings (quick QA list)

Should match

UMA5456640124
uma5456640124
UMA 545664 01 24
Student ID: UMA5456640124
UMA 5 4 5 6 6 4 0 1 2 4

Should NOT match

UMA5456641324 (month 13 invalid)
UMA5456640024 (month 00 invalid)
UMA545660124 (too short)
XUMA5456640124 (blocked by boundary)
UMA54566401245 (too long)

SearchAPI Example: Search for Student IDs

Stage 1 - RegEx (GetSearchItemData)

Stage 2 - C++ Strictly Validates and Normalizes

Example SearchDLL.cpp - Used to Discover UMA Student IDs

Test strings (quick QA list)

Was this article helpful?