Support char arrays

This commit is contained in:
Alexander Luzgarev 2019-03-09 17:00:13 +01:00
parent 0e14434bae
commit 93be86d526
10 changed files with 378 additions and 87 deletions

View File

@ -0,0 +1,61 @@
using NUnit.Framework;
using System.IO;
namespace MatFileHandler.Tests
{
[TestFixture]
public class MatFileReaderHdfTests
{
    private const string TestDirectory = "test-data";

    /// <summary>
    /// Checks that an ASCII-only char array is read with the expected
    /// dimensions, string value and indexed character.
    /// </summary>
    [Test]
    public void TestAscii()
    {
        var chars = ReadCharVariable("ascii");
        Assert.That(chars, Is.Not.Null);
        Assert.That(chars.Dimensions, Is.EqualTo(new[] { 1, 3 }));
        Assert.That(chars.String, Is.EqualTo("abc"));
        Assert.That(chars[2], Is.EqualTo('c'));
    }

    /// <summary>
    /// Checks that a char array holding non-ASCII characters is read with the
    /// expected dimensions, string value and individual characters.
    /// </summary>
    [Test]
    public void TestUnicode()
    {
        var chars = ReadCharVariable("unicode");
        Assert.That(chars, Is.Not.Null);
        Assert.That(chars.Dimensions, Is.EqualTo(new[] { 1, 2 }));
        Assert.That(chars.String, Is.EqualTo("必フ"));
        Assert.That(chars[0], Is.EqualTo('必'));
        Assert.That(chars[1], Is.EqualTo('フ'));
    }

    /// <summary>
    /// Checks that a char array holding a "wide" Unicode string is read with
    /// the expected dimensions and string value.
    /// </summary>
    [Test]
    public void TestUnicodeWide()
    {
        var chars = ReadCharVariable("unicode-wide");
        Assert.That(chars, Is.Not.Null);
        Assert.That(chars.Dimensions, Is.EqualTo(new[] { 1, 2 }));
        Assert.That(chars.String, Is.EqualTo("🍆"));
    }

    /// <summary>
    /// Builds the test data factory rooted at the given sub-directory of <see cref="TestDirectory"/>.
    /// </summary>
    private static AbstractTestDataFactory<IMatFile> GetTests(string factoryName)
    {
        var factoryPath = Path.Combine(TestDirectory, factoryName);
        return new MatTestDataFactory(factoryPath);
    }

    /// <summary>
    /// Looks up the named test file in the "hdf" test data set.
    /// </summary>
    private IMatFile ReadHdfTestFile(string testName) => GetTests("hdf")[testName];

    /// <summary>
    /// Reads the named test file and returns its variable "s" as a char array
    /// (null when the value is not an <see cref="ICharArray"/>).
    /// </summary>
    private ICharArray ReadCharVariable(string testName)
    {
        var matFile = ReadHdfTestFile(testName);
        return matFile["s"].Value as ICharArray;
    }
}
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,156 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using System.Runtime.InteropServices;
using System.Text;
using HDF.PInvoke;
namespace MatFileHandler
{
/// <summary>
/// Character array whose contents are kept as a single string.
/// </summary>
public class HdfCharArray : ICharArray
{
    /// <summary>
    /// Initializes a new instance of the <see cref="HdfCharArray"/> class.
    /// </summary>
    /// <param name="dimensions">Dimensions of the array.</param>
    /// <param name="data">Characters of the array, stored as one string.</param>
    public HdfCharArray(int[] dimensions, string data)
    {
        Dimensions = dimensions;
        StringData = data;
    }

    /// <summary>
    /// Gets a value indicating whether the dimensions array has no entries.
    /// </summary>
    public bool IsEmpty => Dimensions.Length == 0;

    /// <summary>
    /// Gets the dimensions passed to the constructor.
    /// </summary>
    public int[] Dimensions { get; }

    /// <summary>
    /// Gets the number of elements, as computed from <see cref="Dimensions"/>.
    /// </summary>
    public int Count => Dimensions.NumberOfElements();

    /// <summary>
    /// Converts every character to the numeric value of its UTF-16 code unit.
    /// </summary>
    /// <returns>Array of doubles, one per character.</returns>
    public double[] ConvertToDoubleArray()
    {
        // Convert.ToDouble(char) is documented to always throw InvalidCastException,
        // so the code unit is converted with a plain numeric cast instead.
        return Data.Select(c => (double)c).ToArray();
    }

    /// <summary>
    /// Converts every character to a complex number with zero imaginary part.
    /// </summary>
    /// <returns>Array of complex numbers, one per character.</returns>
    public Complex[] ConvertToComplexArray()
    {
        return ConvertToDoubleArray().Select(x => new Complex(x, 0.0)).ToArray();
    }

    /// <summary>
    /// Gets a fresh copy of the characters as an array.
    /// </summary>
    public char[] Data => StringData.ToCharArray();

    /// <summary>
    /// Gets or sets the character at the given multi-dimensional index;
    /// the index is flattened against <see cref="Dimensions"/> via DimFlatten.
    /// </summary>
    /// <param name="list">Indices, one per dimension.</param>
    public char this[params int[] list]
    {
        get => StringData[Dimensions.DimFlatten(list)];
        set
        {
            var chars = StringData.ToCharArray();
            chars[Dimensions.DimFlatten(list)] = value;

            // char[].ToString() returns the type name ("System.Char[]"),
            // so the string has to be rebuilt from the characters explicitly.
            StringData = new string(chars);
        }
    }

    /// <summary>
    /// Gets the contents of the array as a string.
    /// </summary>
    public string String => StringData;

    private string StringData { get; set; }
}
/// <summary>
/// Reads the variables stored in an already opened HDF5 file.
/// </summary>
internal class HdfFileReader
{
    private readonly long fileId;
    private List<IVariable> variables;

    /// <summary>
    /// Initializes a new instance of the <see cref="HdfFileReader"/> class.
    /// </summary>
    /// <param name="fileId">Identifier of the opened HDF5 file; the caller keeps ownership of it.</param>
    internal HdfFileReader(long fileId)
    {
        this.fileId = fileId;
    }

    /// <summary>
    /// Iterates over the links of the file and collects one variable per dataset.
    /// </summary>
    /// <returns>File object built from the variables read.</returns>
    internal IMatFile Read()
    {
        variables = new List<IVariable>();
        H5G.info_t group_info = default(H5G.info_t);
        H5G.get_info(fileId, ref group_info);
        var numberOfVariables = group_info.nlinks;
        ulong idx = 0;
        while (idx < numberOfVariables)
        {
            H5L.iterate(
                fileId,
                H5.index_t.NAME,
                H5.iter_order_t.NATIVE,
                ref idx,
                VariableIterator,
                IntPtr.Zero);
        }

        return new MatFile(variables);
    }

    /// <summary>
    /// Callback passed to H5L.iterate: reads the link with the given name
    /// and, if it is a dataset, appends it to the list of variables.
    /// </summary>
    /// <param name="group">Identifier of the group being iterated.</param>
    /// <param name="name">Pointer to the link name (read as an ANSI string).</param>
    /// <param name="info">Link information (unused).</param>
    /// <param name="op_data">User data (unused).</param>
    /// <returns>Always 0.</returns>
    /// <exception cref="NotImplementedException">The link refers to a group.</exception>
    private int VariableIterator(long group, IntPtr name, ref H5L.info_t info, IntPtr op_data)
    {
        var variableName = Marshal.PtrToStringAnsi(name);
        var object_info = default(H5O.info_t);
        H5O.get_info_by_name(group, variableName, ref object_info);
        switch (object_info.type)
        {
            case H5O.type_t.DATASET:
                var datasetId = H5D.open(group, variableName);
                try
                {
                    var value = ReadDataset(datasetId);
                    variables.Add(new MatVariable(value, variableName, false));
                }
                finally
                {
                    // The dataset handle is only needed while reading; release it
                    // even if the dataset turns out to be of an unsupported kind.
                    H5D.close(datasetId);
                }

                break;
            case H5O.type_t.GROUP:
                throw new NotImplementedException();
        }

        return 0;
    }

    /// <summary>
    /// Reads the "MATLAB_class" string attribute of a dataset.
    /// </summary>
    /// <param name="datasetId">Identifier of the dataset.</param>
    /// <returns>Attribute bytes decoded as ASCII.</returns>
    /// <exception cref="NotImplementedException">The attribute is not of string class.</exception>
    private static string GetMatlabClassOfDataset(long datasetId)
    {
        var attributeId = H5A.open_by_name(datasetId, ".", "MATLAB_class");
        try
        {
            var typeId = H5A.get_type(attributeId);
            try
            {
                var cl = H5T.get_class(typeId);
                if (cl != H5T.class_t.STRING)
                {
                    throw new NotImplementedException();
                }

                // In-memory string type with the same size as the stored attribute type.
                var classId = H5T.copy(H5T.C_S1);
                try
                {
                    var typeIdSize = H5T.get_size(typeId);
                    H5T.set_size(classId, typeIdSize);
                    var buf = Marshal.AllocHGlobal(typeIdSize);
                    try
                    {
                        H5A.read(attributeId, classId, buf);
                        var matlabClassNameBytes = new byte[(int)typeIdSize];
                        Marshal.Copy(buf, matlabClassNameBytes, 0, (int)typeIdSize);
                        return Encoding.ASCII.GetString(matlabClassNameBytes);
                    }
                    finally
                    {
                        Marshal.FreeHGlobal(buf);
                    }
                }
                finally
                {
                    H5T.close(classId);
                }
            }
            finally
            {
                H5T.close(typeId);
            }
        }
        finally
        {
            H5A.close(attributeId);
        }
    }

    /// <summary>
    /// Reads the extent of a dataset's dataspace.
    /// </summary>
    /// <param name="datasetId">Identifier of the dataset.</param>
    /// <returns>Dimensions in the reverse of the order reported by HDF5.</returns>
    private static int[] GetDimensionsOfDataset(long datasetId)
    {
        var spaceId = H5D.get_space(datasetId);
        try
        {
            var rank = H5S.get_simple_extent_ndims(spaceId);
            var dims = new ulong[rank];
            H5S.get_simple_extent_dims(spaceId, dims, null);

            // NOTE(review): presumably reversed to translate HDF5's dimension order
            // into MATLAB's; confirm against the file format description.
            Array.Reverse(dims);
            return dims.Select(x => (int)x).ToArray();
        }
        finally
        {
            H5S.close(spaceId);
        }
    }

    /// <summary>
    /// Reads a dataset as an array, dispatching on its MATLAB class.
    /// </summary>
    /// <param name="datasetId">Identifier of the dataset.</param>
    /// <returns>Array read from the dataset.</returns>
    /// <exception cref="NotImplementedException">The MATLAB class is anything other than "char".</exception>
    private static IArray ReadDataset(long datasetId)
    {
        var dims = GetDimensionsOfDataset(datasetId);
        var matlabClass = GetMatlabClassOfDataset(datasetId);
        if (matlabClass == "char")
        {
            return ReadCharArray(datasetId, dims);
        }

        throw new NotImplementedException();
    }

    /// <summary>
    /// Reads a dataset as 16-bit unsigned integers and decodes the bytes as UTF-16.
    /// </summary>
    /// <param name="datasetId">Identifier of the dataset.</param>
    /// <param name="dims">Dimensions to assign to the resulting array.</param>
    /// <returns>Char array with the given dimensions.</returns>
    private static IArray ReadCharArray(long datasetId, int[] dims)
    {
        // NOTE(review): the buffer is sized from the dataset's storage size; this
        // presumably equals the in-memory size only for uncompressed 16-bit data — confirm.
        var storageSize = (int)H5D.get_storage_size(datasetId);
        var data = new byte[storageSize];
        var dataBuffer = Marshal.AllocHGlobal(storageSize);
        try
        {
            H5D.read(datasetId, H5T.NATIVE_UINT16, H5S.ALL, H5S.ALL, H5P.DEFAULT, dataBuffer);
            Marshal.Copy(dataBuffer, data, 0, storageSize);
        }
        finally
        {
            Marshal.FreeHGlobal(dataBuffer);
        }

        var str = Encoding.Unicode.GetString(data);
        return new HdfCharArray(dims, str);
    }
}
}

View File

@ -13,13 +13,16 @@ namespace MatFileHandler
/// </summary> /// </summary>
internal class Header internal class Header
{ {
private Header(string text, long subsystemDataOffset, int version) private Header(byte[] rawBytes, string text, long subsystemDataOffset, int version)
{ {
RawBytes = rawBytes;
Text = text; Text = text;
SubsystemDataOffset = subsystemDataOffset; SubsystemDataOffset = subsystemDataOffset;
Version = version; Version = version;
} }
public byte[] RawBytes { get; }
/// <summary> /// <summary>
/// Gets the header text. /// Gets the header text.
/// </summary> /// </summary>
@ -55,7 +58,7 @@ namespace MatFileHandler
platform = platform.Remove(length); platform = platform.Remove(length);
} }
var text = $"MATLAB 5.0 MAT-file, Platform: {platform}, Created on: {dateTime}{padding}"; var text = $"MATLAB 5.0 MAT-file, Platform: {platform}, Created on: {dateTime}{padding}";
return new Header(text, 0, 256); return new Header(null, text, 0, 256);
} }
/// <summary> /// <summary>
@ -65,18 +68,26 @@ namespace MatFileHandler
/// <returns>The header read.</returns> /// <returns>The header read.</returns>
public static Header Read(BinaryReader reader) public static Header Read(BinaryReader reader)
{ {
var textBytes = reader.ReadBytes(116); var rawBytes = reader.ReadBytes(128);
var text = System.Text.Encoding.UTF8.GetString(textBytes); using (var stream = new MemoryStream(rawBytes))
var subsystemDataOffsetBytes = reader.ReadBytes(8);
var subsystemDataOffset = BitConverter.ToInt64(subsystemDataOffsetBytes, 0);
var version = reader.ReadInt16();
var endian = reader.ReadInt16();
var isLittleEndian = endian == 19785;
if (!isLittleEndian)
{ {
throw new NotSupportedException("Big-endian files are not supported."); using (var newReader = new BinaryReader(stream))
{
var textBytes = newReader.ReadBytes(116);
var text = System.Text.Encoding.UTF8.GetString(textBytes);
var subsystemDataOffsetBytes = newReader.ReadBytes(8);
var subsystemDataOffset = BitConverter.ToInt64(subsystemDataOffsetBytes, 0);
var version = newReader.ReadInt16();
var endian = newReader.ReadInt16();
var isLittleEndian = endian == 19785;
if (!isLittleEndian)
{
throw new NotSupportedException("Big-endian files are not supported.");
}
return new Header(rawBytes, text, subsystemDataOffset, version);
}
} }
return new Header(text, subsystemDataOffset, version);
} }
private static string GetOperatingSystem() private static string GetOperatingSystem()

View File

@ -0,0 +1,41 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using HDF.PInvoke;
namespace MatFileHandler
{
/// <summary>
/// Reads HDF-based .mat files by handing the complete file contents to the HDF5 library as an in-memory image.
/// </summary>
internal static class MatFileHdfReader
{
    /// <summary>
    /// Continues reading a file whose header has already been consumed from the stream.
    /// </summary>
    /// <param name="header">Header that was read; its raw bytes are put back in front of the remaining data.</param>
    /// <param name="stream">Stream positioned right after the header.</param>
    /// <returns>File contents read by <see cref="HdfFileReader"/>.</returns>
    internal static IMatFile ContinueReadingHdfFile(Header header, Stream stream)
    {
        using (var memoryStream = new MemoryStream())
        {
            using (var headerStream = new MemoryStream(header.RawBytes))
            {
                headerStream.CopyTo(memoryStream);
            }

            stream.CopyTo(memoryStream);
            var bytes = memoryStream.ToArray();
            return ReadFromByteArray(bytes);
        }
    }

    /// <summary>
    /// Opens the given bytes as an HDF5 file image and reads its variables.
    /// </summary>
    /// <param name="bytes">Complete contents of the file, header included.</param>
    /// <returns>File contents read by <see cref="HdfFileReader"/>.</returns>
    private static IMatFile ReadFromByteArray(byte[] bytes)
    {
        var fileAccessPropertyList = H5P.create(H5P.FILE_ACCESS);
        var ptr = IntPtr.Zero;
        try
        {
            H5P.set_fapl_core(fileAccessPropertyList, IntPtr.Add(IntPtr.Zero, 1024), 0);
            ptr = Marshal.AllocCoTaskMem(bytes.Length);
            Marshal.Copy(bytes, 0, ptr, bytes.Length);
            H5P.set_file_image(fileAccessPropertyList, ptr, IntPtr.Add(IntPtr.Zero, bytes.Length));

            // No file with this name needs to exist: the contents come from the image set above.
            var fileId = H5F.open(Guid.NewGuid().ToString(), H5F.ACC_RDONLY, fileAccessPropertyList);
            try
            {
                var hdfFileReader = new HdfFileReader(fileId);
                return hdfFileReader.Read();
            }
            finally
            {
                // The cache must be cleared while the file identifier is still valid,
                // i.e. before the file is closed.
                H5F.clear_elink_file_cache(fileId);
                H5F.close(fileId);
            }
        }
        finally
        {
            // H5Pset_file_image is documented to copy the supplied buffer, so our copy
            // is released here; FreeCoTaskMem does nothing for IntPtr.Zero.
            Marshal.FreeCoTaskMem(ptr);
            H5P.close(fileAccessPropertyList);
        }
    }
}
}

View File

@ -0,0 +1,88 @@
using System.Collections.Generic;
using System.IO;
namespace MatFileHandler
{
/// <summary>
/// Reader for the part of a Level 5 .mat file that follows the header.
/// </summary>
internal static class MatFileLevel5Reader
{
    /// <summary>
    /// Reads data elements from the reader until the end of the stream is hit.
    /// </summary>
    /// <param name="reader">Binary reader positioned at the first data element.</param>
    /// <param name="subsystemDataOffset">
    /// Stream position of the subsystem data; the element starting there is parsed
    /// as subsystem data instead of being returned as a variable.</param>
    /// <param name="subsystemData">
    /// Subsystem data object of the current file; it is updated in place
    /// once the subsystem data element has been parsed.</param>
    /// <returns>The "raw" variables, each with the stream position it started at.</returns>
    internal static List<RawVariable> ReadRawVariables(BinaryReader reader, long subsystemDataOffset, SubsystemData subsystemData)
    {
        var rawVariables = new List<RawVariable>();
        var elementReader = new DataElementReader(subsystemData);
        try
        {
            for (;;)
            {
                var startPosition = reader.BaseStream.Position;
                var element = elementReader.Read(reader);
                if (startPosition != subsystemDataOffset)
                {
                    rawVariables.Add(new RawVariable(startPosition, element));
                    continue;
                }

                var subsystemBytes = (element as IArrayOf<byte>).Data;
                subsystemData.Set(ReadSubsystemData(subsystemBytes, subsystemData));
            }
        }
        catch (EndOfStreamException)
        {
            // Running out of data is the regular way for the loop above to finish.
        }

        return rawVariables;
    }

    /// <summary>
    /// Reads data elements from the reader, starting with a fresh subsystem data object.
    /// </summary>
    /// <param name="reader">Binary reader positioned at the first data element.</param>
    /// <param name="subsystemDataOffset">Stream position of the subsystem data (taken from the file header).</param>
    /// <returns>The "raw" variables read.</returns>
    internal static List<RawVariable> ReadRawVariables(BinaryReader reader, long subsystemDataOffset) =>
        ReadRawVariables(reader, subsystemDataOffset, new SubsystemData());

    /// <summary>
    /// Reads the remainder of a Level 5 file and wraps every raw variable
    /// that holds a <see cref="MatArray"/> into a variable; other elements are skipped.
    /// </summary>
    /// <param name="header">Header that was already read; supplies the subsystem data offset.</param>
    /// <param name="reader">Binary reader positioned right after the header.</param>
    /// <returns>File object containing the variables.</returns>
    internal static IMatFile ContinueReadingLevel5File(Header header, BinaryReader reader)
    {
        var variables = new List<IVariable>();
        foreach (var rawVariable in ReadRawVariables(reader, header.SubsystemDataOffset))
        {
            if (rawVariable.DataElement is MatArray matArray)
            {
                var isGlobal = matArray.Flags.Variable.HasFlag(Variable.IsGlobal);
                variables.Add(new MatVariable(matArray, matArray.Name, isGlobal));
            }
        }

        return new MatFile(variables);
    }

    /// <summary>
    /// Parses the bytes of the subsystem data element.
    /// </summary>
    private static SubsystemData ReadSubsystemData(byte[] bytes, SubsystemData subsystemData) =>
        SubsystemDataReader.Read(bytes, subsystemData);
}
}

View File

@ -34,89 +34,23 @@ namespace MatFileHandler
} }
} }
/// <summary> private IMatFile Read(BinaryReader reader)
/// Read a sequence of raw variables from .mat file.
/// </summary>
/// <param name="reader">Reader.</param>
/// <param name="subsystemDataOffset">Offset of subsystem data in the file;
/// we need it because we may encounter it during reading, and
/// the subsystem data should be parsed in a special way.</param>
/// <param name="subsystemData">
/// Link to the current file's subsystem data structure; initially it has dummy value
/// which will be replaced after we parse the whole subsystem data.</param>
/// <returns>List of "raw" variables; the actual variables are constructed from them later.</returns>
internal static List<RawVariable> ReadRawVariables(BinaryReader reader, long subsystemDataOffset, SubsystemData subsystemData)
{
var variables = new List<RawVariable>();
var dataElementReader = new DataElementReader(subsystemData);
while (true)
{
try
{
var position = reader.BaseStream.Position;
var dataElement = dataElementReader.Read(reader);
if (position == subsystemDataOffset)
{
var subsystemDataElement = dataElement as IArrayOf<byte>;
var newSubsystemData = ReadSubsystemData(subsystemDataElement.Data, subsystemData);
subsystemData.Set(newSubsystemData);
}
else
{
variables.Add(new RawVariable(position, dataElement));
}
}
catch (EndOfStreamException)
{
break;
}
}
return variables;
}
/// <summary>
/// Read raw variables from a .mat file.
/// </summary>
/// <param name="reader">Binary reader.</param>
/// <param name="subsystemDataOffset">Offset to the subsystem data to use (read from the file header).</param>
/// <returns>Raw variables read.</returns>
internal static List<RawVariable> ReadRawVariables(BinaryReader reader, long subsystemDataOffset)
{
var subsystemData = new SubsystemData();
return ReadRawVariables(reader, subsystemDataOffset, subsystemData);
}
private static IMatFile Read(BinaryReader reader)
{ {
var header = ReadHeader(reader); var header = ReadHeader(reader);
var rawVariables = ReadRawVariables(reader, header.SubsystemDataOffset); switch (header.Version)
var variables = new List<IVariable>();
foreach (var variable in rawVariables)
{ {
var array = variable.DataElement as MatArray; case 256:
if (array is null) return MatFileLevel5Reader.ContinueReadingLevel5File(header, reader);
{ case 512:
continue; return MatFileHdfReader.ContinueReadingHdfFile(header, reader.BaseStream);
} default:
throw new NotSupportedException($"Unknown file format.");
variables.Add(new MatVariable(
array,
array.Name,
array.Flags.Variable.HasFlag(Variable.IsGlobal)));
} }
return new MatFile(variables);
} }
private static Header ReadHeader(BinaryReader reader) private static Header ReadHeader(BinaryReader reader)
{ {
return Header.Read(reader); return Header.Read(reader);
} }
private static SubsystemData ReadSubsystemData(byte[] bytes, SubsystemData subsystemData)
{
return SubsystemDataReader.Read(bytes, subsystemData);
}
} }
} }

View File

@ -29,7 +29,7 @@ namespace MatFileHandler
using (var reader = new BinaryReader(stream)) using (var reader = new BinaryReader(stream))
{ {
reader.ReadBytes(8); reader.ReadBytes(8);
rawVariables = MatFileReader.ReadRawVariables(reader, -1, subsystemData); rawVariables = MatFileLevel5Reader.ReadRawVariables(reader, -1, subsystemData);
} }
} }