The following programs demonstrate how to compare strings and create sort keys using a collator for a specific locale (en_US in these examples).
In C:
#include <stdio.h>
#include <memory.h>
#include <string.h>
#include "unicode/ustring.h"
#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/ucol.h"
#define MAXBUFFERSIZE 100
#define BIGBUFFERSIZE 5000
UBoolcollateWithLocaleInC(constchar*locale,UErrorCode*status){UChardispName[MAXBUFFERSIZE];int32_tbufferLen=0;UCharsource[MAXBUFFERSIZE];UChartarget[MAXBUFFERSIZE];UCollationResultresult=UCOL_EQUAL;uint8_tsourceKeyArray[MAXBUFFERSIZE];uint8_ttargetKeyArray[MAXBUFFERSIZE];int32_tsourceKeyOut=0,targetKeyOut=0;UCollator*myCollator=0;if(U_FAILURE(*status)){returnfalse;}u_uastrcpy(source,"This is a test.");u_uastrcpy(target,"THIS IS A TEST.");myCollator=ucol_open(locale,status);if(U_FAILURE(*status)){bufferLen=uloc_getDisplayName(locale,0,dispName,MAXBUFFERSIZE,status);/*Report the error with display name... */fprintf(stderr,"Failed to create the collator for : \"%s\"\n",dispName);returnfalse;}result=ucol_strcoll(myCollator,source,u_strlen(source),target,u_strlen(target));/* result is 1, secondary differences only for ignorable space characters*/if(result!=UCOL_LESS){fprintf(stderr,"Comparing two strings with only secondary differences in C failed.\n");returnfalse;}/* To compare them with just primary differences */ucol_setStrength(myCollator,UCOL_PRIMARY);result=ucol_strcoll(myCollator,source,u_strlen(source),target,u_strlen(target));/* result is 0 */if(result!=0){fprintf(stderr,"Comparing two strings with no differences in C failed.\n");returnfalse;}/* Now, do the same comparison with keys */sourceKeyOut=ucol_getSortKey(myCollator,source,-1,sourceKeyArray,MAXBUFFERSIZE);targetKeyOut=ucol_getSortKey(myCollator,target,-1,targetKeyArray,MAXBUFFERSIZE);result=0;result=strcmp(sourceKeyArray,targetKeyArray);if(result!=0){fprintf(stderr,"Comparing two strings with sort keys in C failed.\n");returnfalse;}ucol_close(myCollator);returntrue;}
In C++:
#include <stdio.h>
#include "unicode/unistr.h"
#include "unicode/utypes.h"
#include "unicode/locid.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/sortkey.h"
UBoolcollateWithLocaleInCPP(constLocale&locale,UErrorCode&status){UnicodeStringdispName;UnicodeStringsource("This is a test.");UnicodeStringtarget("THIS IS A TEST.");Collator::EComparisonResultresult=Collator::EQUAL;CollationKeysourceKey;CollationKeytargetKey;Collator*myCollator=0;if(U_FAILURE(status)){returnfalse;}myCollator=Collator::createInstance(locale,status);if(U_FAILURE(status)){locale.getDisplayName(dispName);/*Report the error with display name... */fprintf(stderr,"%s: Failed to create the collator for : \"%s\"\n",dispName);returnfalse;}result=myCollator->compare(source,target);/* result is 1, secondary differences only for ignorable space characters*/if(result!=UCOL_LESS){fprintf(stderr,"Comparing two strings with only secondary differences in C failed.\n");returnfalse;}/* To compare them with just primary differences */myCollator->setStrength(Collator::PRIMARY);result=myCollator->compare(source,target);/* result is 0 */if(result!=0){fprintf(stderr,"Comparing two strings with no differences in C failed.\n");returnfalse;}/* Now, do the same comparison with keys */myCollator->getCollationKey(source,sourceKey,status);myCollator->getCollationKey(target,targetKey,status);result=Collator::EQUAL;result=sourceKey.compareTo(targetKey);if(result!=0){fprintf(stderr,"%s: Comparing two strings with sort keys in C failed.\n");returnfalse;}deletemyCollator;returntrue;}
Main Function
extern"C"UBoolcollateWithLocaleInC(constchar*locale,UErrorCode*status);intmain(){UErrorCodestatus=U_ZERO_ERROR;fprintf(stdout,"\n");if(collateWithLocaleInCPP(Locale("en","US"),status)!=true){fprintf(stderr,"Collate with locale in C++ failed.\n");}else{fprintf(stdout,"Collate with Locale C++ example worked!!\n");}status=U_ZERO_ERROR;fprintf(stdout,"\n");if(collateWithLocaleInC("en_US",&status)!=true){fprintf(stderr,"%s: Collate with locale in C failed.\n");}else{fprintf(stdout,"Collate with Locale C example worked!!\n");}return0;}
In Java:
importcom.ibm.icu.text.Collator;importcom.ibm.icu.text.CollationElementIterator;importcom.ibm.icu.text.CollationKey;importjava.util.Locale;publicclassCollateExample{publicstaticvoidmain(Stringarg[]){CollateExampleexample=newCollateExample();try{if(!example.collateWithLocale(Locale.US)){System.err.println("Collate with locale example failed.");}else{System.out.println("Collate with Locale example worked!!");}}catch(Exceptione){System.err.println("Collating with locale failed");e.printStackTrace();}}publicbooleancollateWithLocale(Localelocale)throwsException{Stringsource="This is a test.";Stringtarget="THIS IS A TEST.";CollatormyCollator=Collator.getInstance(locale);intresult=myCollator.compare(source,target);// result is 1, secondary differences only for ignorable space charactersif(result>=0){System.err.println("Comparing two strings with only secondary differences failed.");returnfalse;}// To compare them with just primary differencesmyCollator.setStrength(Collator.PRIMARY);result=myCollator.compare(source,target);// result is 0if(result!=0){System.err.println("Comparing two strings with no differences failed.");returnfalse;}// Now, do the same comparison with keysCollationKeysourceKey=myCollator.getCollationKey(source);CollationKeytargetKey=myCollator.getCollationKey(target);result=sourceKey.compareTo(targetKey);if(result!=0){System.err.println("Comparing two strings with sort keys failed.");returnfalse;}returntrue;}}
Language-sensitive searching
String searching is a well-researched area, and there are algorithms that can optimize the searching process. Perhaps the best is the Boyer-Moore method. For a full description of this method, see Laura Werner’s text searching article (https://icu-project.org/docs/papers/efficient_text_searching_in_java.html).
However, implementing collation-based search with the Boyer-Moore method while getting correct results is very tricky, and ICU no longer uses this method (as of ICU4C 4.0 and ICU4J 53).
A good solution for the problem of not knowing the sort key size in advance is to allocate a large buffer and store all the sort keys there, while keeping a list of indexes or pointers to that buffer.
The following sample code takes an array of UChar pointers (the source strings) and an array of key indexes. It allocates a buffer, fills it with the sort keys, stores in the index array the offset at which each key starts, and reports the maximum sort key size through maxSize; the return value is the total number of bytes used in the buffer. Once you have done this for your strings, you just need to allocate a field of the maximum size and copy your sort keys from the buffer to the fields.
/*
 * Builds the sort keys of sourceSize strings back to back in one
 * heap-allocated buffer.
 *
 * coll:       collator used to create the keys.
 * source:     array of sourceSize zero-terminated UChar strings.
 * keys:       array of sourceSize entries; keys[i] receives the offset in
 *             *buffer at which the sort key of source[i] starts.
 * sourceSize: number of strings in source.
 * buffer:     receives the malloc()ed buffer holding all keys; the caller
 *             releases it with free(). Set to NULL when allocation fails.
 * maxSize:    receives the size of the largest single sort key.
 * status:     in/out ICU error code; set to U_MEMORY_ALLOCATION_ERROR when
 *             memory runs out and to U_ILLEGAL_ARGUMENT_ERROR for NULL
 *             arguments. Nothing is done if it is NULL or already a failure.
 *
 * Returns the total number of bytes used in *buffer, or 0 on error.
 */
uint32_t fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys,
                            uint32_t sourceSize, uint8_t **buffer,
                            uint32_t *maxSize, UErrorCode *status)
{
    const uint32_t increment = 16384;
    uint32_t bufferSize = 16384;
    uint32_t currentOffset = 0;
    uint32_t i = 0;

    if (status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if (coll == NULL || buffer == NULL || maxSize == NULL ||
        (sourceSize > 0 && (source == NULL || keys == NULL))) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    *maxSize = 0;
    *buffer = (uint8_t *)malloc(bufferSize);
    /* Test the allocation result (*buffer), not the out-parameter itself. */
    if (*buffer == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    for (i = 0; i < sourceSize; i++) {
        uint32_t available = bufferSize - currentOffset;
        uint32_t keySize = (uint32_t)ucol_getSortKey(coll, source[i], -1,
                                                     *buffer + currentOffset,
                                                     (int32_t)available);

        if (keySize > available) {
            /* The key did not fit. Grow in whole increments until it does;
             * one increment is not enough for a key larger than that. */
            uint32_t newSize = bufferSize;
            uint8_t *grown = NULL;

            while (newSize - currentOffset < keySize) {
                newSize += increment;
            }

            /* Keep the old pointer until realloc() has succeeded so the
             * existing block is not leaked on failure. */
            grown = (uint8_t *)realloc(*buffer, newSize);
            if (grown == NULL) {
                free(*buffer);
                *buffer = NULL;
                *status = U_MEMORY_ALLOCATION_ERROR;
                return 0;
            }
            *buffer = grown;
            bufferSize = newSize;

            keySize = (uint32_t)ucol_getSortKey(coll, source[i], -1,
                                                *buffer + currentOffset,
                                                (int32_t)(bufferSize - currentOffset));
        }

        keys[i] = currentOffset;

        /* here you can hook code that does something interesting with the
         * keySize - remembers the maximum or similar...
         */
        if (keySize > *maxSize) {
            *maxSize = keySize;
        }
        currentOffset += keySize;
    }

    return currentOffset;
}