C#からIndex Serviceを使って”分かち書き(わかちがき)”する | .NET Framework 2.0 と C#での開発ブログ

C#からIndex Serviceを使って”分かち書き(わかちがき)”する

先の記事(C#からIMEを使って逆検索でかな変換する )でMS-IMEを使ってかな変換をする話を書き、その際に「分かち書きが出来ないのでふりがなを付けられない」とも書きました。しかし実は、”分かち書き(わかちがき)”もWindowsに用意されている”Index Service”を利用すれば一応出来ます。(対応するOSバージョンは不明)
でも私がちょっと試してみた限り、、、精度が低すぎてだめでした。
これはもしかしたらアプリ側の設定とかで調整すればなんとかなるのかなぁ~と淡い期待を持っているので今度調べようと思ってます。

ちなみにC#からIndex Serviceを使うサンプルソースが外人さんのブログにあったのでそれを参考に日本語で分かち書きするサンプルソースを下記に残します。

(情報源:http://sqljunkies.com/WebLog/acencini/articles/595.aspx)
-----------------------------------------------------------------------------------------
[WordBreaker.cs]
//===============================================================
// WordBreaker.cs
//===============================================================
using System;
using System.Runtime.InteropServices;

namespace StemText
{
//===============================================================
// Wordbreaker stuff
//===============================================================
// Kind of break reported to IWordSink.PutBreak.
// The four values are consecutive ordinals (0..3), not combinable bit masks,
// so the enum is deliberately NOT marked [Flags]: with [Flags], EOC (3) would
// read as "EOS | EOP" to anything that treats the values as a bit set.
public enum WORDREP_BREAK_TYPE
{
    WORDREP_BREAK_EOW = 0, // printed as "EOW" by CWordSink.PutBreak
    WORDREP_BREAK_EOS = 1, // printed as "EOS"
    WORDREP_BREAK_EOP = 2, // printed as "EOP"
    WORDREP_BREAK_EOC = 3  // printed as "EOC"
}

// Managed declaration of the COM word-sink interface (IUnknown-based,
// identified by the GUID below). An object implementing it is handed to
// IWordBreaker.BreakText, which calls back into it.
// The members map onto the native vtable in declaration order, so they must
// not be reordered, removed or have their marshalling changed.
[ComImport]
[Guid("CC907054-C058-101A-B554-08002B33B0E6")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IWordSink
{
// Reports one word: pwcInBuf holds the text and cwc is the number of
// characters that belong to the word (CWordSink takes Substring(0, cwc)).
// cwcSrcLen / cwcSrcPos presumably give the length and offset of the word in
// the original source text -- TODO confirm against the native documentation.
void PutWord([MarshalAs(UnmanagedType.U4)] int cwc,
[MarshalAs(UnmanagedType.LPWStr)] string pwcInBuf,
[MarshalAs(UnmanagedType.U4)] int cwcSrcLen,
[MarshalAs(UnmanagedType.U4)] int cwcSrcPos);
// Same parameter shape as PutWord; by its name it reports an alternative
// form of a word.
void PutAltWord([MarshalAs(UnmanagedType.U4)] int cwc,
[MarshalAs(UnmanagedType.LPWStr)] string pwcInBuf,
[MarshalAs(UnmanagedType.U4)] int cwcSrcLen,
[MarshalAs(UnmanagedType.U4)] int cwcSrcPos);
// Bracket a run of alternative-phrase callbacks (start / end markers).
void StartAltPhrase();
void EndAltPhrase();
// Reports a break of the given kind (see WORDREP_BREAK_TYPE).
void PutBreak(WORDREP_BREAK_TYPE breakType);
}

// Managed declaration of the COM phrase-sink interface (IUnknown-based,
// identified by the GUID below). An object implementing it is passed as the
// third argument of IWordBreaker.BreakText.
// Declaration order defines the vtable layout; do not reorder the members.
[ComImport]
[Guid("CC906FF0-C058-101A-B554-08002B33B0E6")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPhraseSink
{
// Reports a noun / modifier pair. cwcNoun and cwcModifier are the character
// counts to take from pwcNoun and pwcModifier (CPhraseSink uses them as
// Substring lengths). The meaning of ulAttachmentType is not visible in this
// file -- TODO confirm against the native documentation.
void PutSmallPhrase([MarshalAs(UnmanagedType.LPWStr)] string pwcNoun,
[MarshalAs(UnmanagedType.U4)] int cwcNoun,
[MarshalAs(UnmanagedType.LPWStr)] string pwcModifier,
[MarshalAs(UnmanagedType.U4)] int cwcModifier,
[MarshalAs(UnmanagedType.U4)] int ulAttachmentType);
// Reports a phrase: the first cwcPhrase characters of pwcPhrase.
void PutPhrase([MarshalAs(UnmanagedType.LPWStr)] string pwcPhrase,
[MarshalAs(UnmanagedType.U4)] int cwcPhrase);
}

// IWordSink implementation that simply echoes every callback to the console,
// one line per callback.
public class CWordSink: IWordSink
{
    // Prints the first cwc characters of pwcInBuf, prefixed with "PutWord buffer: ".
    public void PutWord(int cwc, string pwcInBuf, int cwcSrcLen, int cwcSrcPos)
    {
        string word = pwcInBuf.Substring(0, cwc);
        Console.WriteLine("PutWord buffer: " + word);
    }

    // Prints the first cwc characters of pwcInBuf, prefixed with "PutAltWord buffer: ".
    public void PutAltWord(int cwc, string pwcInBuf, int cwcSrcLen, int cwcSrcPos)
    {
        string altWord = pwcInBuf.Substring(0, cwc);
        Console.WriteLine("PutAltWord buffer: " + altWord);
    }

    // Prints a marker line for the start of an alternative phrase.
    public void StartAltPhrase()
    {
        Console.WriteLine("StartAltPhrase");
    }

    // Prints a marker line for the end of an alternative phrase.
    public void EndAltPhrase()
    {
        Console.WriteLine("EndAltPhrase");
    }

    // Prints the short label of the reported break kind.
    public void PutBreak(StemText.WORDREP_BREAK_TYPE breakType)
    {
        Console.WriteLine("PutBreak : " + LabelFor(breakType));
    }

    // Maps a break kind to its three-letter label; any value outside the
    // four known kinds yields "ERROR".
    private static string LabelFor(WORDREP_BREAK_TYPE kind)
    {
        if (kind == WORDREP_BREAK_TYPE.WORDREP_BREAK_EOW)
        {
            return "EOW";
        }
        if (kind == WORDREP_BREAK_TYPE.WORDREP_BREAK_EOS)
        {
            return "EOS";
        }
        if (kind == WORDREP_BREAK_TYPE.WORDREP_BREAK_EOP)
        {
            return "EOP";
        }
        if (kind == WORDREP_BREAK_TYPE.WORDREP_BREAK_EOC)
        {
            return "EOC";
        }
        return "ERROR";
    }
}

// IPhraseSink implementation that echoes every reported phrase to the console.
public class CPhraseSink: IPhraseSink
{
    // Prints the noun and the modifier (each cut to its own character count),
    // separated by " , ".
    public void PutSmallPhrase(string pwcNoun, int cwcNoun, string pwcModifier, int cwcModifier, int ulAttachmentType)
    {
        // Noun first, then modifier: same evaluation order as the single
        // concatenation expression this replaces.
        string noun = pwcNoun.Substring(0, cwcNoun);
        string modifier = pwcModifier.Substring(0, cwcModifier);
        Console.WriteLine(string.Concat("PutSmallPhrase: ", noun, " , ", modifier));
    }

    // Prints the first cwcPhrase characters of pwcPhrase.
    public void PutPhrase(string pwcPhrase, int cwcPhrase)
    {
        string phrase = pwcPhrase.Substring(0, cwcPhrase);
        Console.WriteLine("PutPhrase: " + phrase);
    }
}

// Describes the text handed to IWordBreaker.BreakText.
// Sequential layout: the fields are marshalled in exactly this order, so do
// not reorder them or change their marshalling attributes.
[StructLayout(LayoutKind.Sequential)]
public struct TEXT_SOURCE
{
// Callback marshalled as a native function pointer (see delFillTextBuffer).
[MarshalAs(UnmanagedType.FunctionPtr)] public delFillTextBuffer pfnFillTextBuffer;
// The text to break, marshalled as a wide-character string.
[MarshalAs(UnmanagedType.LPWStr)] public string awcBuffer;
// The sample caller sets this to the length of awcBuffer.
[MarshalAs(UnmanagedType.U4)] public int iEnd;
// The sample caller sets this to 0, i.e. the start of awcBuffer.
[MarshalAs(UnmanagedType.U4)] public int iCur;
}

// Callback type stored in TEXT_SOURCE.pfnFillTextBuffer; it receives the
// TEXT_SOURCE by reference so it can refill / update it.
// NOTE(review): the uint return value is presumably a native status code
// (HRESULT-style) -- confirm against the native documentation.
public delegate uint delFillTextBuffer([MarshalAs(UnmanagedType.Struct)] ref TEXT_SOURCE pTextSource);

// Managed declaration of the COM word-breaker interface (IUnknown-based,
// identified by the GUID below). The sample obtains it by casting a
// CWordBreaker instance.
// Declaration order defines the vtable layout; do not reorder the members.
[ComImport]
[Guid("D53552C8-77E3-101A-B552-08002B33B0E6")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IWordBreaker
{
// Initialises the breaker; the sample calls Init(true, 1000, out flag) once
// before BreakText. fQuery presumably selects query-time (vs. index-time)
// behaviour and maxTokenSize the longest token produced -- TODO confirm.
// pfLicense is written by the callee.
void Init([MarshalAs(UnmanagedType.Bool)] bool fQuery,
[MarshalAs(UnmanagedType.U4)] int maxTokenSize,
[MarshalAs(UnmanagedType.Bool)] out bool pfLicense);
// Breaks the text described by pTextSource and reports the results through
// the supplied word sink and phrase sink.
void BreakText([MarshalAs(UnmanagedType.Struct)] ref TEXT_SOURCE pTextSource,
[MarshalAs(UnmanagedType.Interface)] IWordSink pWordSink,
[MarshalAs(UnmanagedType.Interface)] IPhraseSink pPhraseSink);
// Returns a license string through the out parameter.
// NOTE(review): an 'out string' marshalled as LPWStr makes the runtime take
// ownership of (and free) the returned buffer; verify that this matches the
// native contract before calling this method. The sample never calls it.
void GetLicenseToUse([MarshalAs(UnmanagedType.LPWStr)] out string ppwcsLicense);
}

// COM coclass stub: 'new CWordBreaker()' asks COM to create the object
// registered under the CLSID below, and the result is then cast to
// IWordBreaker. The class body is intentionally empty.
// NOTE(review): which language's word breaker this CLSID resolves to depends
// on what is registered on the machine -- confirm it is the intended one.
[ComImport]
[Guid("BE41F4E6-9EAD-498f-A473-F3CA66F9BE8B")]
public class CWordBreaker
{
}
}

上記のクラスを用意してあとはテキストボックス1と2を用意してボタンを押すと1の文字を分かち書きして2に表示するテストをしました。 

// Click handler: word-breaks the text in textBox1 and shows the result in
// textBox2.
//
// CWordSink and CPhraseSink report everything through Console.WriteLine, so
// Console.Out is redirected into a StringWriter for the duration of the
// BreakText call and the captured lines are what textBox2 displays.
// (Assigning pTextSource.ToString() would only show the struct's type name,
// because TEXT_SOURCE does not override ToString.)
private void button1_Click(object sender, EventArgs e)
{
    CWordBreaker wb = new CWordBreaker();
    System.IO.TextWriter previousOut = Console.Out;
    System.IO.StringWriter captured = new System.IO.StringWriter();

    try
    {
        IWordBreaker iwb = (IWordBreaker)wb;
        IWordSink iws = new CWordSink();
        IPhraseSink ips = new CPhraseSink();

        bool pfLicense;
        iwb.Init(true, 1000, out pfLicense);

        string tokStr = textBox1.Text;

        // Hold the callback in a local so it can be kept alive explicitly
        // while native code owns the function pointer created from it.
        delFillTextBuffer fillCallback = new delFillTextBuffer(pfnFillTextBuffer);

        TEXT_SOURCE pTextSource = new TEXT_SOURCE();
        pTextSource.pfnFillTextBuffer = fillCallback;
        pTextSource.awcBuffer = tokStr;
        pTextSource.iCur = 0;
        pTextSource.iEnd = tokStr.Length;

        // Capture what the sinks print while the text is being broken.
        Console.SetOut(captured);
        iwb.BreakText(ref pTextSource, iws, ips);

        GC.KeepAlive(fillCallback);
    }
    finally
    {
        // Always restore the console and release the COM object, even when
        // Init or BreakText throws.
        Console.SetOut(previousOut);
        System.Runtime.InteropServices.Marshal.ReleaseComObject(wb);
    }

    textBox2.Text = captured.ToString();
}
-----------------------------------------------------------------------------------------

こんな感じでテストは動きましたとさ。