From c8f581d2f8cb815e957eb577b7a6a58d7222043e Mon Sep 17 00:00:00 2001 From: angeousta <132761637+angeousta@users.noreply.github.com> Date: Thu, 22 Aug 2024 21:03:11 +0200 Subject: [PATCH] Refactor (#175) * Update Parser.cs, UriExtensions.cs, and Scraper.cs * Update Constants.cs, Ranking.cs, RankingSummaryStudent.cs, and 4 more files * revert: field order in Ranking class --------- Co-authored-by: userbot_github Co-authored-by: Lorenzo --- .../Data/Constants.cs | 2 +- .../Objects/RankingNS/Ranking.cs | 2 +- .../Utils/Output/RankingSummaryStudent.cs | 18 ++++---- .../Utils/Transformer/ParserNS/Parser.cs | 2 +- .../Main/Program.cs | 2 +- .../Utils/UriExtensions.cs | 15 +++--- .../Utils/Web/Scraper.cs | 46 ++++++++++--------- .../Utils/Web/ScraperOutput.cs | 7 ++- 8 files changed, 48 insertions(+), 46 deletions(-) diff --git a/PoliNetwork.Graduatorie.Common/Data/Constants.cs b/PoliNetwork.Graduatorie.Common/Data/Constants.cs index 5919cc2e..e2f9f5a9 100644 --- a/PoliNetwork.Graduatorie.Common/Data/Constants.cs +++ b/PoliNetwork.Graduatorie.Common/Data/Constants.cs @@ -11,4 +11,4 @@ public static class Constants public const string OutputManifestiFilename = "manifesti.json"; public const string DataFolder = "data"; public const string LocationPlaceholder = "0"; -} +} \ No newline at end of file diff --git a/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/Ranking.cs b/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/Ranking.cs index a4812f52..a9aae6cd 100644 --- a/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/Ranking.cs +++ b/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/Ranking.cs @@ -180,4 +180,4 @@ public override int GetHashCode() { return GetHashWithoutLastUpdate(); } -} \ No newline at end of file +} diff --git a/PoliNetwork.Graduatorie.Parser/Utils/Output/RankingSummaryStudent.cs b/PoliNetwork.Graduatorie.Parser/Utils/Output/RankingSummaryStudent.cs index a9ef094c..cb5c8727 100644 --- a/PoliNetwork.Graduatorie.Parser/Utils/Output/RankingSummaryStudent.cs +++ b/PoliNetwork.Graduatorie.Parser/Utils/Output/RankingSummaryStudent.cs @@ -40,19 +40,11 @@ public RankingSummaryStudent(string? phase, SchoolEnum? school, int? year, Ranki Url = url; } - public bool Equals(RankingSummaryStudent? other) - { - if (ReferenceEquals(null, other)) return false; - if (ReferenceEquals(this, other)) return true; - return Course == other.Course && Phase == other.Phase && School == other.School && Equals(Url, other.Url) && - Year == other.Year; - } - public int CompareTo(RankingSummaryStudent? other) { if (ReferenceEquals(this, other)) return 0; if (ReferenceEquals(null, other)) return 1; - + var i = (Year ?? 0) - (other.Year ?? 0); if (i != 0) return i < 0 ? -1 : 1; @@ -72,6 +64,14 @@ public int CompareTo(RankingSummaryStudent? other) return i; } + public bool Equals(RankingSummaryStudent? other) + { + if (ReferenceEquals(null, other)) return false; + if (ReferenceEquals(this, other)) return true; + return Course == other.Course && Phase == other.Phase && School == other.School && Equals(Url, other.Url) && + Year == other.Year; + } + public override bool Equals(object? obj) { if (obj is not RankingSummaryStudent rankingSummaryStudent) return false; diff --git a/PoliNetwork.Graduatorie.Parser/Utils/Transformer/ParserNS/Parser.cs b/PoliNetwork.Graduatorie.Parser/Utils/Transformer/ParserNS/Parser.cs index b3e8430d..50f0d3b7 100644 --- a/PoliNetwork.Graduatorie.Parser/Utils/Transformer/ParserNS/Parser.cs +++ b/PoliNetwork.Graduatorie.Parser/Utils/Transformer/ParserNS/Parser.cs @@ -238,7 +238,7 @@ private RankingsSet ParseNewRankings(IReadOnlyCollection htmls) RankingOrder = rankingOrder, RankingSummary = new RankingSummary(), ByMerit = new MeritTable(), - ByCourse = new List(), + ByCourse = new List() }; } diff --git a/PoliNetwork.Graduatorie.Scraper/Main/Program.cs b/PoliNetwork.Graduatorie.Scraper/Main/Program.cs index 575c363c..ac083558 100644 --- a/PoliNetwork.Graduatorie.Scraper/Main/Program.cs +++ b/PoliNetwork.Graduatorie.Scraper/Main/Program.cs @@ -27,7 +27,7 @@ public static List RankingsUrls(Metrics mt, ArgsConfig argsConfig) var rankingsUrls = mt.Execute(LinksFind.GetAll).ToList(); rankingsUrls = ScraperOutput.GetWithUrlsFromLocalFileLinks(rankingsUrls, argsConfig.DataFolder); - var scraper = new Scraper.Utils.Web.Scraper(); + var scraper = new Utils.Web.Scraper(); var manifesti = mt.Execute(scraper.ScrapeManifesti); PrintLinks(rankingsUrls); diff --git a/PoliNetwork.Graduatorie.Scraper/Utils/UriExtensions.cs b/PoliNetwork.Graduatorie.Scraper/Utils/UriExtensions.cs index f0b27f7d..5bddd82b 100644 --- a/PoliNetwork.Graduatorie.Scraper/Utils/UriExtensions.cs +++ b/PoliNetwork.Graduatorie.Scraper/Utils/UriExtensions.cs @@ -1,21 +1,20 @@ -namespace PoliNetwork.Graduatorie.Scraper.Utils; +using System.Web; -using System; -using System.Collections.Specialized; -using System.Web; // For this you need to reference System.Web assembly from the GAC +namespace PoliNetwork.Graduatorie.Scraper.Utils; +// For this you need to reference System.Web assembly from the GAC public static class UriExtensions { public static Uri SetQueryVal(this Uri uri, string name, object value) { - NameValueCollection nvc = HttpUtility.ParseQueryString(uri.Query); + var nvc = HttpUtility.ParseQueryString(uri.Query); nvc[name] = value.ToString(); - return new UriBuilder(uri) {Query = nvc.ToString()}.Uri; + return new UriBuilder(uri) { Query = nvc.ToString() }.Uri; } - + public static Uri RemoveQueryVal(this Uri uri, string name) { - NameValueCollection nvc = HttpUtility.ParseQueryString(uri.Query); + var nvc = HttpUtility.ParseQueryString(uri.Query); nvc.Remove(name); return new UriBuilder(uri) { Query = nvc.ToString() }.Uri; } diff --git a/PoliNetwork.Graduatorie.Scraper/Utils/Web/Scraper.cs b/PoliNetwork.Graduatorie.Scraper/Utils/Web/Scraper.cs index d1fb8a75..162ce57c 100644 --- a/PoliNetwork.Graduatorie.Scraper/Utils/Web/Scraper.cs +++ b/PoliNetwork.Graduatorie.Scraper/Utils/Web/Scraper.cs @@ -15,8 +15,15 @@ public class Scraper private const string BaseUrl = "https://www.polimi.it"; private const string AvvisiFuturiStudentiUrl = "https://www.polimi.it/futuri-studenti/avvisi"; + private static readonly HttpClientHandler HttpClientHandler = new() + { + AllowAutoRedirect = false + }; + private readonly HashSet _alreadyVisited = new(); + private readonly HttpClient _httpClient = new(HttpClientHandler); + private readonly string[] _newsTesters = { "graduatorie", "graduatoria", "punteggi", "tol", @@ -26,13 +33,6 @@ public class Scraper private readonly HtmlWeb _web = new(); - private static readonly HttpClientHandler HttpClientHandler = new() - { - AllowAutoRedirect = false - }; - - private readonly HttpClient _httpClient = new(HttpClientHandler); - public IEnumerable GetRankingsLinks() { // before there were multiple source to get links. @@ -76,11 +76,12 @@ public SortedDictionary>>(); - var designUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/design-degli-interni"; - var ingCivileUrl = + const string designUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/design-degli-interni"; + const string ingCivileUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-per-lambiente-e-il-territorio"; - var ingUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-informatica"; - var archUrbUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-edile-architettura"; + const string ingUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-informatica"; + const string archUrbUrl = + "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-edile-architettura"; string[] urls = { designUrl, ingCivileUrl, ingUrl, archUrbUrl }; @@ -112,7 +113,8 @@ public SortedDictionary>()); var groupMap = map[cleanName]; if (groupMap == null) throw new UnreachableException(); @@ -127,14 +129,14 @@ public SortedDictionary()); var courseDict = groupMap[courseName]; var optionLink = new Uri(finalLink.AbsoluteUri).SetQueryVal("k_corso_la", intValue.ToString()); @@ -145,15 +147,17 @@ public SortedDictionary rankingsUrls, string? dataFolder) } public static void WriteManifesti( - SortedDictionary>> manifesti, string? dataFolder) + SortedDictionary>> manifesti, + string? dataFolder) { if (string.IsNullOrEmpty(dataFolder)) return; @@ -74,10 +75,9 @@ public static void WriteManifesti( var jsonString = JsonConvert.SerializeObject(manifesti, Culture.JsonSerializerSettings); var count = manifesti.Sum(a => a.Value.Sum(b => b.Value.Count)); - + Console.WriteLine($"[INFO] ScraperOutput writing to file {filePath}: {count} manifesti"); File.WriteAllText(filePath, jsonString); - } private static string GetOutputLinksString(IEnumerable rankingsUrls) @@ -109,7 +109,6 @@ private static string GetLinksFilePath(string dataFolder) private static string GetManifestiFilePath(string dataFolder) { - return Path.Join(dataFolder, Constants.OutputFolder, Constants.OutputManifestiFilename); } } \ No newline at end of file