Skip to content

Commit d4884c3

Browse files
Feature: Automatically detect zip encoding (#17045)
Signed-off-by: oxygen-dioxide <[email protected]> Co-authored-by: Yair <[email protected]>
1 parent 9fe3a93 commit d4884c3

File tree

8 files changed

+175
-62
lines changed

8 files changed

+175
-62
lines changed

Diff for: Directory.Packages.props

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
<PackageVersion Include="Microsoft.Graphics.Win2D" Version="1.3.2" />
3939
<PackageVersion Include="TagLibSharp" Version="2.3.0" />
4040
<PackageVersion Include="Tulpep.ActiveDirectoryObjectPicker" Version="3.0.11" />
41+
<PackageVersion Include="UTF.Unknown" Version="2.5.1" />
4142
<PackageVersion Include="WinUIEx" Version="2.5.1" />
4243
<PackageVersion Include="Vanara.Windows.Extensions" Version="4.0.1" />
4344
<PackageVersion Include="Vanara.Windows.Shell" Version="4.0.1" />

Diff for: src/Files.App/Actions/Content/Archives/Decompress/DecompressArchive.cs

+7-1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ public override async Task ExecuteAsync(object? parameter = null)
4343

4444
var isArchiveEncrypted = await FilesystemTasks.Wrap(() => StorageArchiveService.IsEncryptedAsync(archive.Path));
4545
var isArchiveEncodingUndetermined = await FilesystemTasks.Wrap(() => StorageArchiveService.IsEncodingUndeterminedAsync(archive.Path));
46+
Encoding? detectedEncoding = null;
47+
if (isArchiveEncodingUndetermined)
48+
{
49+
detectedEncoding = await FilesystemTasks.Wrap(() => StorageArchiveService.DetectEncodingAsync(archive.Path));
50+
}
4651
var password = string.Empty;
4752
Encoding? encoding = null;
4853

@@ -51,7 +56,8 @@ public override async Task ExecuteAsync(object? parameter = null)
5156
{
5257
IsArchiveEncrypted = isArchiveEncrypted,
5358
IsArchiveEncodingUndetermined = isArchiveEncodingUndetermined,
54-
ShowPathSelection = true
59+
ShowPathSelection = true,
60+
DetectedEncoding = detectedEncoding,
5561
};
5662
decompressArchiveDialog.ViewModel = decompressArchiveViewModel;
5763

Diff for: src/Files.App/Data/Contracts/IStorageArchiveService.cs

+8-1
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,17 @@ public interface IStorageArchiveService
5959
/// <summary>
6060
/// Gets the value that indicates whether the archive file's encoding is undetermined.
6161
/// </summary>
62-
/// <param name="archiveFilePath">The archive file path to check if the item is encrypted.</param>
62+
/// <param name="archiveFilePath">The archive file path to check if the encoding is undetermined.</param>
6363
/// <returns>True if the archive file's encoding is undetermined; otherwise, false.</returns>
6464
Task<bool> IsEncodingUndeterminedAsync(string archiveFilePath);
6565

66+
/// <summary>
67+
/// Detects the encoding of a zip file whose encoding is undetermined.
68+
/// </summary>
69+
/// <param name="archiveFilePath">The path of the archive file whose encoding should be detected.</param>
70+
/// <returns>Null if the archive file doesn't need to detect encoding or its encoding can't be detected; otherwise, the encoding detected.</returns>
71+
Task<Encoding?> DetectEncodingAsync(string archiveFilePath);
72+
6673
/// <summary>
6774
/// Gets the <see cref="SevenZipExtractor"/> instance from the archive file path.
6875
/// </summary>

Diff for: src/Files.App/Data/Items/EncodingItem.cs

+41-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public sealed class EncodingItem
2222
/// Initializes a new instance of the <see cref="EncodingItem"/> class.
2323
/// </summary>
2424
/// <param name="code">The code of the language.</param>
25-
public EncodingItem(string code)
25+
public EncodingItem(string? code)
2626
{
2727
if (string.IsNullOrEmpty(code))
2828
{
@@ -36,6 +36,45 @@ public EncodingItem(string code)
3636
}
3737
}
3838

39-
public override string ToString() => Name;
39+
/// <summary>
/// Initializes a new instance of the <see cref="EncodingItem"/> class from an existing encoding and a display name.
/// </summary>
/// <param name="encoding">The encoding this item represents.</param>
/// <param name="name">The display name; this is the text returned by <see cref="ToString"/>.</param>
public EncodingItem(Encoding encoding, string name)
	=> (Encoding, Name) = (encoding, name);
44+
45+
/// <summary>
/// The built-in list of encoding choices, built once from the code names below.
/// </summary>
/// <remarks>
/// Declared <c>readonly</c> so the shared list cannot be swapped out by other code.
/// The array's elements are still writable, so callers that need a different list
/// should build a new array instead of editing this one in place.
/// </remarks>
public static readonly EncodingItem[] Defaults = new string?[]
{
	null, // System default
	"UTF-8",

	// All possible Windows system encodings
	// Reference: https://en.wikipedia.org/wiki/Windows_code_page

	// East Asian
	"shift_jis", // Japanese
	"gb2312", // Simplified Chinese
	"big5", // Traditional Chinese
	"ks_c_5601-1987", // Korean

	// Southeast Asian
	"Windows-1258", // Vietnamese
	"Windows-874", // Thai

	// Middle East
	"Windows-1256", // Arabic
	"Windows-1255", // Hebrew
	"Windows-1254", // Turkish

	// European
	"Windows-1252", // Western European
	"Windows-1250", // Central European
	"Windows-1251", // Cyrillic
	"Windows-1253", // Greek
	"Windows-1257", // Baltic

	"macintosh",
}
.Select(x => new EncodingItem(x))
.ToArray();
77+
78+
/// <summary>
/// Returns the item's <see cref="Name"/>.
/// </summary>
public override string ToString()
{
	return Name;
}
4079
}
4180
}

Diff for: src/Files.App/Files.App.csproj

+1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595
<PackageReference Include="Microsoft.Graphics.Win2D" />
9696
<PackageReference Include="TagLibSharp" />
9797
<PackageReference Include="Tulpep.ActiveDirectoryObjectPicker" />
98+
<PackageReference Include="UTF.Unknown" />
9899
<PackageReference Include="WinUIEx" />
99100
<PackageReference Include="Vanara.Windows.Extensions" />
100101
<PackageReference Include="Vanara.Windows.Shell" />

Diff for: src/Files.App/Services/Storage/StorageArchiveService.cs

+84-46
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
using ICSharpCode.SharpZipLib.Zip;
77
using SevenZip;
88
using System.IO;
9-
using System.Linq;
109
using System.Text;
10+
using UtfUnknown;
1111
using Windows.Storage;
1212
using Windows.Win32;
1313

@@ -90,7 +90,8 @@ public async Task<bool> CompressAsync(ICompressArchiveModel compressionModel)
9090
/// <inheritdoc/>
9191
public Task<bool> DecompressAsync(string archiveFilePath, string destinationFolderPath, string password = "", Encoding? encoding = null)
9292
{
93-
if(encoding == null){
93+
if (encoding == null)
94+
{
9495
return DecompressAsyncWithSevenZip(archiveFilePath, destinationFolderPath, password);
9596
}
9697
else
@@ -203,22 +204,22 @@ async Task<bool> DecompressAsyncWithSharpZipLib(string archiveFilePath, string d
203204
string.IsNullOrEmpty(destinationFolderPath))
204205
return false;
205206
using var zipFile = new ZipFile(archiveFilePath, StringCodec.FromEncoding(encoding));
206-
if(zipFile is null)
207+
if (zipFile is null)
207208
return false;
208-
209-
if(!string.IsNullOrEmpty(password))
209+
210+
if (!string.IsNullOrEmpty(password))
210211
zipFile.Password = password;
211212

212213
// Initialize a new in-progress status card
213214
var statusCard = StatusCenterHelper.AddCard_Decompress(
214215
archiveFilePath.CreateEnumerable(),
215216
destinationFolderPath.CreateEnumerable(),
216217
ReturnResult.InProgress);
217-
218+
218219
// Check if the decompress operation canceled
219220
if (statusCard.CancellationToken.IsCancellationRequested)
220221
return false;
221-
222+
222223
StatusCenterItemProgressModel fsProgress = new(
223224
statusCard.ProgressEventSource,
224225
enumerationCompleted: true,
@@ -233,51 +234,52 @@ async Task<bool> DecompressAsyncWithSharpZipLib(string archiveFilePath, string d
233234
{
234235
long processedBytes = 0;
235236
int processedFiles = 0;
236-
237-
foreach (ZipEntry zipEntry in zipFile)
237+
await Task.Run(async () =>
238238
{
239-
if (statusCard.CancellationToken.IsCancellationRequested)
239+
foreach (ZipEntry zipEntry in zipFile)
240240
{
241-
isSuccess = false;
242-
break;
243-
}
244-
245-
if (!zipEntry.IsFile)
246-
{
247-
continue; // Ignore directories
248-
}
241+
if (statusCard.CancellationToken.IsCancellationRequested)
242+
{
243+
isSuccess = false;
244+
break;
245+
}
249246

250-
string entryFileName = zipEntry.Name;
251-
string fullZipToPath = Path.Combine(destinationFolderPath, entryFileName);
252-
string directoryName = Path.GetDirectoryName(fullZipToPath);
247+
if (!zipEntry.IsFile)
248+
{
249+
continue; // Ignore directories
250+
}
253251

254-
if (!Directory.Exists(directoryName))
255-
{
256-
Directory.CreateDirectory(directoryName);
257-
}
252+
string entryFileName = zipEntry.Name;
253+
string fullZipToPath = Path.Combine(destinationFolderPath, entryFileName);
254+
string directoryName = Path.GetDirectoryName(fullZipToPath);
258255

259-
byte[] buffer = new byte[4096]; // 4K is a good default
260-
using (Stream zipStream = zipFile.GetInputStream(zipEntry))
261-
using (FileStream streamWriter = File.Create(fullZipToPath))
262-
{
263-
await ThreadingService.ExecuteOnUiThreadAsync(() =>
256+
if (!Directory.Exists(directoryName))
264257
{
265-
fsProgress.FileName = entryFileName;
266-
fsProgress.Report();
267-
});
258+
Directory.CreateDirectory(directoryName);
259+
}
268260

269-
StreamUtils.Copy(zipStream, streamWriter, buffer);
270-
}
271-
processedBytes += zipEntry.Size;
272-
if (fsProgress.TotalSize > 0)
273-
{
274-
fsProgress.Report(processedBytes / (double)fsProgress.TotalSize * 100);
261+
byte[] buffer = new byte[4096]; // 4K is a good default
262+
using (Stream zipStream = zipFile.GetInputStream(zipEntry))
263+
using (FileStream streamWriter = File.Create(fullZipToPath))
264+
{
265+
await ThreadingService.ExecuteOnUiThreadAsync(() =>
266+
{
267+
fsProgress.FileName = entryFileName;
268+
fsProgress.Report();
269+
});
270+
271+
StreamUtils.Copy(zipStream, streamWriter, buffer);
272+
}
273+
processedBytes += zipEntry.Size;
274+
if (fsProgress.TotalSize > 0)
275+
{
276+
fsProgress.Report(processedBytes / (double)fsProgress.TotalSize * 100);
277+
}
278+
processedFiles++;
279+
fsProgress.AddProcessedItemsCount(1);
280+
fsProgress.Report();
275281
}
276-
processedFiles++;
277-
fsProgress.AddProcessedItemsCount(1);
278-
fsProgress.Report();
279-
}
280-
282+
});
281283
if (!statusCard.CancellationToken.IsCancellationRequested)
282284
{
283285
isSuccess = true;
@@ -321,7 +323,7 @@ await ThreadingService.ExecuteOnUiThreadAsync(() =>
321323
return isSuccess;
322324
}
323325

324-
326+
325327
/// <inheritdoc/>
326328
public string GenerateArchiveNameFromItems(IReadOnlyList<ListedItem> items)
327329
{
@@ -355,7 +357,7 @@ public async Task<bool> IsEncodingUndeterminedAsync(string archiveFilePath)
355357
{
356358
using (ZipFile zipFile = new ZipFile(archiveFilePath))
357359
{
358-
return !zipFile.Cast<ZipEntry>().All(entry=>entry.IsUnicodeText);
360+
return !zipFile.Cast<ZipEntry>().All(entry => entry.IsUnicodeText);
359361
}
360362
}
361363
catch (Exception ex)
@@ -365,6 +367,42 @@ public async Task<bool> IsEncodingUndeterminedAsync(string archiveFilePath)
365367
}
366368
}
367369

370+
/// <inheritdoc/>
public async Task<Encoding?> DetectEncodingAsync(string archiveFilePath)
{
	// Names are only trusted when the detector is more confident than this.
	const double minimumConfidence = 0.5;

	// Opening and scanning the archive is blocking work, so it runs on the thread pool
	// rather than synchronously on the caller's thread.
	return await Task.Run<Encoding?>(() =>
	{
		try
		{
			// SharpZipLib requires an encoding to decode entry names. Code page 437 is used as a
			// temporary carrier because it maps every byte value to a character, so the original
			// name bytes can be recovered losslessly with GetBytes below.
			// It is resolved inside the try block so that an unavailable code page yields null
			// instead of an unhandled exception.
			var cp437 = Encoding.GetEncoding(437);

			using var zipFile = new ZipFile(archiveFilePath, StringCodec.FromEncoding(cp437));

			// Only entries that are not flagged as Unicode need their encoding guessed.
			var undeterminedNames = string.Join(
				"\n",
				zipFile.Cast<ZipEntry>()
					.Where(e => !e.IsUnicodeText)
					.Select(e => e.Name));

			// Nothing to analyze: the archive does not need encoding detection.
			if (undeterminedNames.Length == 0)
				return null;

			var fileNameBytes = cp437.GetBytes(undeterminedNames);
			var detected = CharsetDetector.DetectFromBytes(fileNameBytes).Detected;

			return detected is not null && detected.Confidence > minimumConfidence
				? detected.Encoding
				: null;
		}
		catch (Exception ex)
		{
			Console.WriteLine($"SharpZipLib error: {ex.Message}");
			return null;
		}
	});
}
405+
368406
/// <inheritdoc/>
369407
public async Task<SevenZipExtractor?> GetSevenZipExtractorAsync(string archiveFilePath, string password = "")
370408
{

Diff for: src/Files.App/Strings/en-US/Resources.resw

+3
Original file line numberDiff line numberDiff line change
@@ -2099,6 +2099,9 @@
20992099
<data name="Encoding" xml:space="preserve">
21002100
<value>Encoding</value>
21012101
</data>
2102+
<data name="EncodingDetected" xml:space="preserve">
2103+
<value>{0} (detected)</value>
2104+
</data>
21022105
<data name="ExtractToPath" xml:space="preserve">
21032106
<value>Path</value>
21042107
</data>

Diff for: src/Files.App/ViewModels/Dialogs/DecompressArchiveDialogViewModel.cs

+30-12
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,16 @@ public bool IsArchiveEncodingUndetermined
4444
set => SetProperty(ref isArchiveEncodingUndetermined, value);
4545
}
4646

47+
// Backing field for DetectedEncoding.
private Encoding? detectedEncoding;

/// <summary>
/// Gets or sets the encoding detected for the archive, or null when there is none.
/// Every assignment also rebuilds the encoding options.
/// </summary>
public Encoding? DetectedEncoding
{
	get
	{
		return detectedEncoding;
	}
	set
	{
		_ = SetProperty(ref detectedEncoding, value);

		// Runs on every assignment, whatever SetProperty returned.
		RefreshEncodingOptions();
	}
}
56+
4757
private bool showPathSelection;
4858
public bool ShowPathSelection
4959
{
@@ -53,19 +63,27 @@ public bool ShowPathSelection
5363

5464
public DisposableArray? Password { get; private set; }
5565

56-
public EncodingItem[] EncodingOptions { get; set; } = new string?[] {
57-
null,//System Default
58-
"UTF-8",
59-
"shift_jis",
60-
"gb2312",
61-
"big5",
62-
"ks_c_5601-1987",
63-
"Windows-1252",
64-
"macintosh",
65-
}
66-
.Select(x=>new EncodingItem(x))
67-
.ToArray();
66+
public EncodingItem[] EncodingOptions { get; set; } = EncodingItem.Defaults;
6867
public EncodingItem SelectedEncoding { get; set; }
68+
/// <summary>
/// Rebuilds <see cref="EncodingOptions"/> from <see cref="EncodingItem.Defaults"/> and selects the first entry.
/// When an encoding has been detected, an item for it (labelled with the localized
/// "EncodingDetected" format string) is placed in front of the defaults.
/// </summary>
private void RefreshEncodingOptions()
{
	if (detectedEncoding is null)
	{
		EncodingOptions = EncodingItem.Defaults;
	}
	else
	{
		var detectedLabel = string.Format(Strings.EncodingDetected.GetLocalizedResource(), detectedEncoding.EncodingName);
		var detectedItem = new EncodingItem(detectedEncoding, detectedLabel);

		// Prepend builds a new sequence, so the shared defaults array itself is left untouched.
		EncodingOptions = EncodingItem.Defaults.Prepend(detectedItem).ToArray();
	}

	SelectedEncoding = EncodingOptions.FirstOrDefault();
}
85+
86+
6987

7088
public IRelayCommand PrimaryButtonClickCommand { get; private set; }
7189

0 commit comments

Comments
 (0)