From 93be86d526a10a682ec75d90fea6ff4d99934a5e Mon Sep 17 00:00:00 2001 From: Alexander Luzgarev Date: Sat, 9 Mar 2019 17:00:13 +0100 Subject: [PATCH] Support char arrays --- MatFileHandler.Tests/MatFileReaderHdfTests.cs | 61 +++++++ MatFileHandler.Tests/test-data/hdf/ascii.mat | Bin 0 -> 1920 bytes .../test-data/hdf/unicode-wide.mat | Bin 0 -> 1920 bytes .../test-data/hdf/unicode.mat | Bin 0 -> 1920 bytes MatFileHandler/HdfFileReader.cs | 156 ++++++++++++++++++ MatFileHandler/Header.cs | 35 ++-- MatFileHandler/MatFileHdfReader.cs | 41 +++++ MatFileHandler/MatFileLevel5Reader.cs | 88 ++++++++++ MatFileHandler/MatFileReader.cs | 82 +-------- MatFileHandler/SubsystemDataReader.cs | 2 +- 10 files changed, 378 insertions(+), 87 deletions(-) create mode 100644 MatFileHandler.Tests/MatFileReaderHdfTests.cs create mode 100644 MatFileHandler.Tests/test-data/hdf/ascii.mat create mode 100644 MatFileHandler.Tests/test-data/hdf/unicode-wide.mat create mode 100644 MatFileHandler.Tests/test-data/hdf/unicode.mat create mode 100644 MatFileHandler/HdfFileReader.cs create mode 100644 MatFileHandler/MatFileHdfReader.cs create mode 100644 MatFileHandler/MatFileLevel5Reader.cs diff --git a/MatFileHandler.Tests/MatFileReaderHdfTests.cs b/MatFileHandler.Tests/MatFileReaderHdfTests.cs new file mode 100644 index 0000000..cdf2085 --- /dev/null +++ b/MatFileHandler.Tests/MatFileReaderHdfTests.cs @@ -0,0 +1,61 @@ +using NUnit.Framework; +using System.IO; + +namespace MatFileHandler.Tests +{ + [TestFixture] + public class MatFileReaderHdfTests + { + private const string TestDirectory = "test-data"; + + /// + /// Test reading an ASCII-encoded string. + /// + [Test] + public void TestAscii() + { + var matFile = ReadHdfTestFile("ascii"); + var arrayAscii = matFile["s"].Value as ICharArray; + Assert.That(arrayAscii, Is.Not.Null); + Assert.That(arrayAscii.Dimensions, Is.EqualTo(new[] { 1, 3 })); + Assert.That(arrayAscii.String, Is.EqualTo("abc")); + Assert.That(arrayAscii[2], Is.EqualTo('c')); + } + + /// + /// Test reading a Unicode string. + /// + [Test] + public void TestUnicode() + { + var matFile = ReadHdfTestFile("unicode"); + var arrayUnicode = matFile["s"].Value as ICharArray; + Assert.That(arrayUnicode, Is.Not.Null); + Assert.That(arrayUnicode.Dimensions, Is.EqualTo(new[] { 1, 2 })); + Assert.That(arrayUnicode.String, Is.EqualTo("必フ")); + Assert.That(arrayUnicode[0], Is.EqualTo('必')); + Assert.That(arrayUnicode[1], Is.EqualTo('フ')); + } + + /// + /// Test reading a wide Unicode string. + /// + [Test] + public void TestUnicodeWide() + { + var matFile = ReadHdfTestFile("unicode-wide"); + var arrayUnicodeWide = matFile["s"].Value as ICharArray; + Assert.That(arrayUnicodeWide, Is.Not.Null); + Assert.That(arrayUnicodeWide.Dimensions, Is.EqualTo(new[] { 1, 2 })); + Assert.That(arrayUnicodeWide.String, Is.EqualTo("🍆")); + } + + private static AbstractTestDataFactory GetTests(string factoryName) => + new MatTestDataFactory(Path.Combine(TestDirectory, factoryName)); + + private IMatFile ReadHdfTestFile(string testName) + { + return GetTests("hdf")[testName]; + } + } +} diff --git a/MatFileHandler.Tests/test-data/hdf/ascii.mat b/MatFileHandler.Tests/test-data/hdf/ascii.mat new file mode 100644 index 0000000000000000000000000000000000000000..4bf17a70ac0ba45619520073b17d8ad5156b23a8 GIT binary patch literal 1920 zcmeHHJ5K^Z5S}|wHv|&I##l{h0pt#Rz*U}rfkcqVC7M{^c`Sj1hXL#@^)J}i`cwQ@ zx{p~C(ZbkR$X@ZyW9NA@+w_A|{}>PRMQr*(u0I^{EVf7CqCcIDJ#1Gl8m)u9ELLVb zT<{)FCmsfaJM8dVbc^Vgy<)*Dl(FErC9GFZ_HiB!_&7v2?>Lyp?+%z_H=2fA;oo%N zQT(%1Vke=ZGX`-;>qKk~Z*=ZAKJ%IbxF*6Pd?0)wdkf?Tuubw!^1pm9IIq<}+%)kF zk{OiZnqo$de>3uT72^?L`dMA%q)<$Ztm6lSf9&JwII~!67~+Gz`R}E#B{LGz#ac z^ev&4t77Vp2H{K^B;iPckY@z_VK+KE4lQ4&!36F>KXzF68df93p z?B}o=g**-iIG?%L-f=LG-yJZ=Zuy2> z;oo%NLHx5!YA2-9)D$y<75=LZH0xeR7JNlj)0Dys&6#P&f(SE@O{4fFTENu&GID6auJUBr zT42JPc*}f3%)8Xde6>zzzD9m+a>N8LPeN?q@oVEPOT4swe!RO*gQN5oWQ2x+c&x>{ z-47;wp-SEoTDc-7k6^?jX^@5!2|}I~^rzi;7WW5XFdu|UEAkWJ^@^dA7(b>SpnT{$ Ublc}Ox!F2v08ALo3V+gp55NmrVE_OC literal 0 HcmV?d00001 diff --git a/MatFileHandler.Tests/test-data/hdf/unicode.mat b/MatFileHandler.Tests/test-data/hdf/unicode.mat new file mode 100644 index 0000000000000000000000000000000000000000..e4583d5a7539ecd2cf8b5d3fdcd35e1eb5622140 GIT binary patch literal 1920 zcmeHH%}N4M6h1Q=T|}X5BdyM|g~|An<*b_2P-sk2s6}9xaYVr%80P_kHuV~9^f(eXAF|Cr4z9=ywkbe{K{(%;D!i`@SgCA>@AQVK#t_w-9)D$yzzzCnI%a>N8LPeN?q^WfFV5-)Av9`CNx;3&NX8KGey9&7P# z4}vLQs*-nvR<4N2BN+2Y8l>S^f{ Dimensions.Length == 0; + + public int[] Dimensions { get; } + + public int Count => Dimensions.NumberOfElements(); + + public double[] ConvertToDoubleArray() + { + return Data.Select(Convert.ToDouble).ToArray(); + } + + public Complex[] ConvertToComplexArray() + { + return ConvertToDoubleArray().Select(x => new Complex(x, 0.0)).ToArray(); + } + + public char[] Data => StringData.ToCharArray(); + + public char this[params int[] list] + { + get => StringData[Dimensions.DimFlatten(list)]; + set { + var chars = StringData.ToCharArray(); + chars[Dimensions.DimFlatten(list)] = value; + StringData = chars.ToString(); + } + } + + public string String => StringData; + + private string StringData { get; set; } + } + + internal class HdfFileReader + { + private long fileId; + + private List variables; + + internal HdfFileReader(long fileId) + { + this.fileId = fileId; + } + + internal IMatFile Read() + { + variables = new List(); + H5G.info_t group_info = default(H5G.info_t); + var result = H5G.get_info(fileId, ref group_info); + var numberOfVariables = group_info.nlinks; + + ulong idx = 0; + while (idx < numberOfVariables) + { + H5L.iterate( + fileId, + H5.index_t.NAME, + H5.iter_order_t.NATIVE, + ref idx, + VariableIterator, + IntPtr.Zero); + } + return new MatFile(variables); + } + + private int VariableIterator(long group, IntPtr name, ref H5L.info_t info, IntPtr op_data) + { + var variableName = Marshal.PtrToStringAnsi(name); + var object_info = default(H5O.info_t); + H5O.get_info_by_name(group, variableName, ref object_info); + switch (object_info.type) + { + case H5O.type_t.DATASET: + var datasetId = H5D.open(group, variableName); + var value = ReadDataset(datasetId); + variables.Add(new MatVariable(value, variableName, false)); + break; + case H5O.type_t.GROUP: + throw new NotImplementedException(); + } + return 0; + } + + private static string GetMatlabClassOfDataset(long datasetId) + { + var attributeId = H5A.open_by_name(datasetId, ".", "MATLAB_class"); + + var typeId = H5A.get_type(attributeId); + var cl = H5T.get_class(typeId); + if (cl != H5T.class_t.STRING) + { + throw new NotImplementedException(); + } + var classId = H5T.copy(H5T.C_S1); + var typeIdSize = H5T.get_size(typeId); + H5T.set_size(classId, typeIdSize); + var buf = Marshal.AllocHGlobal(typeIdSize); + H5A.read(attributeId, classId, buf); + var matlabClassNameBytes = new byte[(int)typeIdSize]; + Marshal.Copy(buf, matlabClassNameBytes, 0, (int)typeIdSize); + return Encoding.ASCII.GetString(matlabClassNameBytes); + } + + private static int[] GetDimensionsOfDataset(long datasetId) + { + var spaceId = H5D.get_space(datasetId); + var rank = H5S.get_simple_extent_ndims(spaceId); + var dims = new ulong[rank]; + H5S.get_simple_extent_dims(spaceId, dims, null); + Array.Reverse(dims); + return dims.Select(x => (int)x).ToArray(); + } + + private static IArray ReadDataset(long datasetId) + { + var dims = GetDimensionsOfDataset(datasetId); + + var matlabClass = GetMatlabClassOfDataset(datasetId); + + if (matlabClass == "char") + { + return ReadCharArray(datasetId, dims); + } + throw new NotImplementedException(); + } + + private static IArray ReadCharArray(long datasetId, int[] dims) + { + var storageSize = (int)H5D.get_storage_size(datasetId); + var data = new byte[storageSize]; + var dataBuffer = Marshal.AllocHGlobal(storageSize); + H5D.read(datasetId, H5T.NATIVE_UINT16, H5S.ALL, H5S.ALL, H5P.DEFAULT, dataBuffer); + Marshal.Copy(dataBuffer, data, 0, storageSize); + var str = Encoding.Unicode.GetString(data); + return new HdfCharArray(dims, str); + } + } +} diff --git a/MatFileHandler/Header.cs b/MatFileHandler/Header.cs index 06b68ce..0e7db2b 100755 --- a/MatFileHandler/Header.cs +++ b/MatFileHandler/Header.cs @@ -13,13 +13,16 @@ namespace MatFileHandler /// internal class Header { - private Header(string text, long subsystemDataOffset, int version) + private Header(byte[] rawBytes, string text, long subsystemDataOffset, int version) { + RawBytes = rawBytes; Text = text; SubsystemDataOffset = subsystemDataOffset; Version = version; } + public byte[] RawBytes { get; } + /// /// Gets the header text. /// @@ -55,7 +58,7 @@ namespace MatFileHandler platform = platform.Remove(length); } var text = $"MATLAB 5.0 MAT-file, Platform: {platform}, Created on: {dateTime}{padding}"; - return new Header(text, 0, 256); + return new Header(null, text, 0, 256); } /// @@ -65,18 +68,26 @@ namespace MatFileHandler /// The header read. public static Header Read(BinaryReader reader) { - var textBytes = reader.ReadBytes(116); - var text = System.Text.Encoding.UTF8.GetString(textBytes); - var subsystemDataOffsetBytes = reader.ReadBytes(8); - var subsystemDataOffset = BitConverter.ToInt64(subsystemDataOffsetBytes, 0); - var version = reader.ReadInt16(); - var endian = reader.ReadInt16(); - var isLittleEndian = endian == 19785; - if (!isLittleEndian) + var rawBytes = reader.ReadBytes(128); + using (var stream = new MemoryStream(rawBytes)) { - throw new NotSupportedException("Big-endian files are not supported."); + using (var newReader = new BinaryReader(stream)) + { + var textBytes = newReader.ReadBytes(116); + var text = System.Text.Encoding.UTF8.GetString(textBytes); + var subsystemDataOffsetBytes = newReader.ReadBytes(8); + var subsystemDataOffset = BitConverter.ToInt64(subsystemDataOffsetBytes, 0); + var version = newReader.ReadInt16(); + var endian = newReader.ReadInt16(); + var isLittleEndian = endian == 19785; + if (!isLittleEndian) + { + throw new NotSupportedException("Big-endian files are not supported."); + } + + return new Header(rawBytes, text, subsystemDataOffset, version); + } } - return new Header(text, subsystemDataOffset, version); } private static string GetOperatingSystem() diff --git a/MatFileHandler/MatFileHdfReader.cs b/MatFileHandler/MatFileHdfReader.cs new file mode 100644 index 0000000..285b75b --- /dev/null +++ b/MatFileHandler/MatFileHdfReader.cs @@ -0,0 +1,41 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Runtime.InteropServices; +using System.Text; +using HDF.PInvoke; + +namespace MatFileHandler +{ + internal static class MatFileHdfReader + { + internal static IMatFile ContinueReadingHdfFile(Header header, Stream stream) + { + using (var memoryStream = new MemoryStream()) + { + using (var headerStream = new MemoryStream(header.RawBytes)) + { + headerStream.CopyTo(memoryStream); + } + stream.CopyTo(memoryStream); + var bytes = memoryStream.ToArray(); + return ReadFromByteArray(bytes); + } + } + + private static IMatFile ReadFromByteArray(byte[] bytes) + { + var fileAccessPropertyList = H5P.create(H5P.FILE_ACCESS); + H5P.set_fapl_core(fileAccessPropertyList, IntPtr.Add(IntPtr.Zero, 1024), 0); + var ptr = Marshal.AllocCoTaskMem(bytes.Length); + Marshal.Copy(bytes, 0, ptr, bytes.Length); + H5P.set_file_image(fileAccessPropertyList, ptr, IntPtr.Add(IntPtr.Zero, bytes.Length)); + var fileId = H5F.open(Guid.NewGuid().ToString(), H5F.ACC_RDONLY, fileAccessPropertyList); + var hdfFileReader = new HdfFileReader(fileId); + var result = hdfFileReader.Read(); + H5F.close(fileId); + H5F.clear_elink_file_cache(fileId); + return result; + } + } +} diff --git a/MatFileHandler/MatFileLevel5Reader.cs b/MatFileHandler/MatFileLevel5Reader.cs new file mode 100644 index 0000000..fc7a81e --- /dev/null +++ b/MatFileHandler/MatFileLevel5Reader.cs @@ -0,0 +1,88 @@ +using System.Collections.Generic; +using System.IO; + +namespace MatFileHandler +{ + internal static class MatFileLevel5Reader + { + + /// + /// Read a sequence of raw variables from .mat file. + /// + /// Reader. + /// Offset of subsystem data in the file; + /// we need it because we may encounter it during reading, and + /// the subsystem data should be parsed in a special way. + /// + /// Link to the current file's subsystem data structure; initially it has dummy value + /// which will be replaced after we parse the whole subsystem data. + /// List of "raw" variables; the actual variables are constructed from them later. + internal static List ReadRawVariables(BinaryReader reader, long subsystemDataOffset, SubsystemData subsystemData) + { + var variables = new List(); + var dataElementReader = new DataElementReader(subsystemData); + while (true) + { + try + { + var position = reader.BaseStream.Position; + var dataElement = dataElementReader.Read(reader); + if (position == subsystemDataOffset) + { + var subsystemDataElement = dataElement as IArrayOf; + var newSubsystemData = ReadSubsystemData(subsystemDataElement.Data, subsystemData); + subsystemData.Set(newSubsystemData); + } + else + { + variables.Add(new RawVariable(position, dataElement)); + } + } + catch (EndOfStreamException) + { + break; + } + } + + return variables; + } + + /// + /// Read raw variables from a .mat file. + /// + /// Binary reader. + /// Offset to the subsystem data to use (read from the file header). + /// Raw variables read. + internal static List ReadRawVariables(BinaryReader reader, long subsystemDataOffset) + { + var subsystemData = new SubsystemData(); + return ReadRawVariables(reader, subsystemDataOffset, subsystemData); + } + + internal static IMatFile ContinueReadingLevel5File(Header header, BinaryReader reader) + { + var rawVariables = ReadRawVariables(reader, header.SubsystemDataOffset); + var variables = new List(); + foreach (var variable in rawVariables) + { + var array = variable.DataElement as MatArray; + if (array is null) + { + continue; + } + + variables.Add(new MatVariable( + array, + array.Name, + array.Flags.Variable.HasFlag(Variable.IsGlobal))); + } + + return new MatFile(variables); + } + + private static SubsystemData ReadSubsystemData(byte[] bytes, SubsystemData subsystemData) + { + return SubsystemDataReader.Read(bytes, subsystemData); + } + } +} diff --git a/MatFileHandler/MatFileReader.cs b/MatFileHandler/MatFileReader.cs index 0050272..9a1b77e 100755 --- a/MatFileHandler/MatFileReader.cs +++ b/MatFileHandler/MatFileReader.cs @@ -34,89 +34,23 @@ namespace MatFileHandler } } - /// - /// Read a sequence of raw variables from .mat file. - /// - /// Reader. - /// Offset of subsystem data in the file; - /// we need it because we may encounter it during reading, and - /// the subsystem data should be parsed in a special way. - /// - /// Link to the current file's subsystem data structure; initially it has dummy value - /// which will be replaced after we parse the whole subsystem data. - /// List of "raw" variables; the actual variables are constructed from them later. - internal static List ReadRawVariables(BinaryReader reader, long subsystemDataOffset, SubsystemData subsystemData) - { - var variables = new List(); - var dataElementReader = new DataElementReader(subsystemData); - while (true) - { - try - { - var position = reader.BaseStream.Position; - var dataElement = dataElementReader.Read(reader); - if (position == subsystemDataOffset) - { - var subsystemDataElement = dataElement as IArrayOf; - var newSubsystemData = ReadSubsystemData(subsystemDataElement.Data, subsystemData); - subsystemData.Set(newSubsystemData); - } - else - { - variables.Add(new RawVariable(position, dataElement)); - } - } - catch (EndOfStreamException) - { - break; - } - } - - return variables; - } - - /// - /// Read raw variables from a .mat file. - /// - /// Binary reader. - /// Offset to the subsystem data to use (read from the file header). - /// Raw variables read. - internal static List ReadRawVariables(BinaryReader reader, long subsystemDataOffset) - { - var subsystemData = new SubsystemData(); - return ReadRawVariables(reader, subsystemDataOffset, subsystemData); - } - - private static IMatFile Read(BinaryReader reader) + private IMatFile Read(BinaryReader reader) { var header = ReadHeader(reader); - var rawVariables = ReadRawVariables(reader, header.SubsystemDataOffset); - var variables = new List(); - foreach (var variable in rawVariables) + switch (header.Version) { - var array = variable.DataElement as MatArray; - if (array is null) - { - continue; - } - - variables.Add(new MatVariable( - array, - array.Name, - array.Flags.Variable.HasFlag(Variable.IsGlobal))); + case 256: + return MatFileLevel5Reader.ContinueReadingLevel5File(header, reader); + case 512: + return MatFileHdfReader.ContinueReadingHdfFile(header, reader.BaseStream); + default: + throw new NotSupportedException($"Unknown file format."); } - - return new MatFile(variables); } private static Header ReadHeader(BinaryReader reader) { return Header.Read(reader); } - - private static SubsystemData ReadSubsystemData(byte[] bytes, SubsystemData subsystemData) - { - return SubsystemDataReader.Read(bytes, subsystemData); - } } } \ No newline at end of file diff --git a/MatFileHandler/SubsystemDataReader.cs b/MatFileHandler/SubsystemDataReader.cs index 9b1b6fc..9799549 100644 --- a/MatFileHandler/SubsystemDataReader.cs +++ b/MatFileHandler/SubsystemDataReader.cs @@ -29,7 +29,7 @@ namespace MatFileHandler using (var reader = new BinaryReader(stream)) { reader.ReadBytes(8); - rawVariables = MatFileReader.ReadRawVariables(reader, -1, subsystemData); + rawVariables = MatFileLevel5Reader.ReadRawVariables(reader, -1, subsystemData); } }