Skip to content

Commit

Permalink
Refactor (#175)
Browse files Browse the repository at this point in the history
* Update Parser.cs, UriExtensions.cs, and Scraper.cs

* Update Constants.cs, Ranking.cs, RankingSummaryStudent.cs, and 4 more files

* revert: field order in Ranking class

---------

Co-authored-by: userbot_github <[email protected]>
Co-authored-by: Lorenzo <[email protected]>
  • Loading branch information
3 people authored Aug 22, 2024
1 parent 1926806 commit c8f581d
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 46 deletions.
2 changes: 1 addition & 1 deletion PoliNetwork.Graduatorie.Common/Data/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ public static class Constants
public const string OutputManifestiFilename = "manifesti.json";
public const string DataFolder = "data";
public const string LocationPlaceholder = "0";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,4 @@ public override int GetHashCode()
{
return GetHashWithoutLastUpdate();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,11 @@ public RankingSummaryStudent(string? phase, SchoolEnum? school, int? year, Ranki
Url = url;
}

/// <summary>
/// Type-specific equality check against another <see cref="RankingSummaryStudent"/>.
/// </summary>
/// <param name="other">The instance to compare with; may be null.</param>
/// <returns>
/// <c>false</c> when <paramref name="other"/> is null, <c>true</c> when it is this very
/// instance, otherwise <c>true</c> only if Course, Phase, School, Url and Year all match.
/// </returns>
public bool Equals(RankingSummaryStudent? other)
{
    if (other is null)
    {
        return false;
    }

    if (ReferenceEquals(this, other))
    {
        return true;
    }

    // Members are compared in the original order so short-circuiting is unchanged.
    return Course == other.Course
           && Phase == other.Phase
           && School == other.School
           && Equals(Url, other.Url)
           && Year == other.Year;
}

public int CompareTo(RankingSummaryStudent? other)
{
if (ReferenceEquals(this, other)) return 0;
if (ReferenceEquals(null, other)) return 1;

var i = (Year ?? 0) - (other.Year ?? 0);
if (i != 0) return i < 0 ? -1 : 1;

Expand All @@ -72,6 +64,14 @@ public int CompareTo(RankingSummaryStudent? other)
return i;
}

/// <summary>
/// Type-specific equality check against another <see cref="RankingSummaryStudent"/>.
/// </summary>
/// <param name="other">The instance to compare with; may be null.</param>
/// <returns>
/// <c>false</c> when <paramref name="other"/> is null, <c>true</c> when it is this very
/// instance, otherwise <c>true</c> only if Course, Phase, School, Url and Year all match.
/// </returns>
public bool Equals(RankingSummaryStudent? other)
{
    if (other is null)
    {
        return false;
    }

    if (ReferenceEquals(this, other))
    {
        return true;
    }

    // Members are compared in the original order so short-circuiting is unchanged.
    return Course == other.Course
           && Phase == other.Phase
           && School == other.School
           && Equals(Url, other.Url)
           && Year == other.Year;
}

public override bool Equals(object? obj)
{
if (obj is not RankingSummaryStudent rankingSummaryStudent) return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ private RankingsSet ParseNewRankings(IReadOnlyCollection<HtmlPage> htmls)
RankingOrder = rankingOrder,
RankingSummary = new RankingSummary(),
ByMerit = new MeritTable(),
ByCourse = new List<CourseTable>(),
ByCourse = new List<CourseTable>()
};
}

Expand Down
2 changes: 1 addition & 1 deletion PoliNetwork.Graduatorie.Scraper/Main/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public static List<RankingUrl> RankingsUrls(Metrics mt, ArgsConfig argsConfig)
var rankingsUrls = mt.Execute(LinksFind.GetAll).ToList();
rankingsUrls = ScraperOutput.GetWithUrlsFromLocalFileLinks(rankingsUrls, argsConfig.DataFolder);

var scraper = new Scraper.Utils.Web.Scraper();
var scraper = new Utils.Web.Scraper();
var manifesti = mt.Execute(scraper.ScrapeManifesti);

PrintLinks(rankingsUrls);
Expand Down
15 changes: 7 additions & 8 deletions PoliNetwork.Graduatorie.Scraper/Utils/UriExtensions.cs
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
namespace PoliNetwork.Graduatorie.Scraper.Utils;
using System.Web;

using System;
using System.Collections.Specialized;
using System.Web; // For this you need to reference System.Web assembly from the GAC
namespace PoliNetwork.Graduatorie.Scraper.Utils;
// For this you need to reference System.Web assembly from the GAC

public static class UriExtensions
{
/// <summary>
/// Builds a new <see cref="Uri"/> from <paramref name="uri"/> in which the query-string
/// parameter <paramref name="name"/> is assigned the string form of <paramref name="value"/>.
/// </summary>
/// <param name="uri">The URI whose query string is used as the starting point.</param>
/// <param name="name">The query-string key to assign.</param>
/// <param name="value">The value to store; it is converted with <c>ToString()</c>.</param>
/// <returns>A URI built from <paramref name="uri"/> with the rewritten query string.</returns>
public static Uri SetQueryVal(this Uri uri, string name, object value)
{
    // Single declaration: the block previously declared `nvc` twice, which does not compile.
    var nvc = HttpUtility.ParseQueryString(uri.Query);
    nvc[name] = value.ToString();

    // The collection's ToString() output is handed to UriBuilder as the new query.
    return new UriBuilder(uri) { Query = nvc.ToString() }.Uri;
}

/// <summary>
/// Builds a new <see cref="Uri"/> from <paramref name="uri"/> with the query-string
/// parameter <paramref name="name"/> removed.
/// </summary>
/// <param name="uri">The URI whose query string is used as the starting point.</param>
/// <param name="name">The query-string key to remove.</param>
/// <returns>A URI built from <paramref name="uri"/> with the rewritten query string.</returns>
public static Uri RemoveQueryVal(this Uri uri, string name)
{
    // Single declaration: the block previously declared `nvc` twice, which does not compile.
    var nvc = HttpUtility.ParseQueryString(uri.Query);
    nvc.Remove(name);

    // The collection's ToString() output is handed to UriBuilder as the new query.
    return new UriBuilder(uri) { Query = nvc.ToString() }.Uri;
}
Expand Down
46 changes: 25 additions & 21 deletions PoliNetwork.Graduatorie.Scraper/Utils/Web/Scraper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,15 @@ public class Scraper
private const string BaseUrl = "https://www.polimi.it";
private const string AvvisiFuturiStudentiUrl = "https://www.polimi.it/futuri-studenti/avvisi";

private static readonly HttpClientHandler HttpClientHandler = new()
{
AllowAutoRedirect = false
};

private readonly HashSet<string> _alreadyVisited = new();

private readonly HttpClient _httpClient = new(HttpClientHandler);

private readonly string[] _newsTesters =
{
"graduatorie", "graduatoria", "punteggi", "tol",
Expand All @@ -26,13 +33,6 @@ public class Scraper

private readonly HtmlWeb _web = new();

private static readonly HttpClientHandler HttpClientHandler = new()
{
AllowAutoRedirect = false
};

private readonly HttpClient _httpClient = new(HttpClientHandler);

public IEnumerable<string> GetRankingsLinks()
{
// before there were multiple source to get links.
Expand Down Expand Up @@ -76,11 +76,12 @@ public SortedDictionary<string, SortedDictionary<string, SortedDictionary<string
{
var map = new SortedDictionary<string, SortedDictionary<string, SortedDictionary<string, string>>>();

var designUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/design-degli-interni";
var ingCivileUrl =
const string designUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/design-degli-interni";
const string ingCivileUrl =
"https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-per-lambiente-e-il-territorio";
var ingUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-informatica";
var archUrbUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-edile-architettura";
const string ingUrl = "https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-informatica";
const string archUrbUrl =
"https://polimi.it/formazione/corsi-di-laurea/dettaglio-corso/ingegneria-edile-architettura";

string[] urls = { designUrl, ingCivileUrl, ingUrl, archUrbUrl };

Expand Down Expand Up @@ -112,7 +113,8 @@ public SortedDictionary<string, SortedDictionary<string, SortedDictionary<string

var cleanName = name.Split(" -").FirstOrDefault(name);

if (!map.ContainsKey(cleanName)) map.Add(cleanName, new());
if (!map.ContainsKey(cleanName))
map.Add(cleanName, new SortedDictionary<string, SortedDictionary<string, string>>());
var groupMap = map[cleanName];
if (groupMap == null) throw new UnreachableException();

Expand All @@ -127,14 +129,14 @@ public SortedDictionary<string, SortedDictionary<string, SortedDictionary<string

var value = option.GetAttributeValue("value", "0");
var courseName = option.InnerText.Split(" (").First();

int intValue;
bool isNumber = int.TryParse(value, out intValue);

var isNumber = int.TryParse(value, out var intValue);

if (!isNumber) continue;
if (intValue == 0) continue;

if (!groupMap.ContainsKey(courseName)) groupMap.Add(courseName, new());

if (!groupMap.ContainsKey(courseName))
groupMap.Add(courseName, new SortedDictionary<string, string>());
var courseDict = groupMap[courseName];

var optionLink = new Uri(finalLink.AbsoluteUri).SetQueryVal("k_corso_la", intValue.ToString());
Expand All @@ -145,15 +147,17 @@ public SortedDictionary<string, SortedDictionary<string, SortedDictionary<string
"//td[contains(@class, 'CenterBar')]/table[contains(@class, 'BoxInfoCard')]//tr[4]/td[4]");

string[] defaultLocation = { "DEFAULT" };
var courseLocations = (courseLocationTd == null || courseLocationTd.Count == 0)
? defaultLocation
var courseLocations = courseLocationTd == null || courseLocationTd.Count == 0
? defaultLocation
: courseLocationTd.First().InnerText.Replace("\t", "").Replace("\n", "").Split(",");

foreach (var courseLocation in courseLocations)
{
var cleanCourseLocation = courseLocation.Trim();
var manifestoLink = new Uri(optionLink.AbsoluteUri).RemoveQueryVal("__pj0").RemoveQueryVal("__pj1");
if(!courseDict.ContainsKey(cleanCourseLocation)) courseDict.Add(cleanCourseLocation, manifestoLink.AbsoluteUri);
var manifestoLink = new Uri(optionLink.AbsoluteUri).RemoveQueryVal("__pj0")
.RemoveQueryVal("__pj1");
if (!courseDict.ContainsKey(cleanCourseLocation))
courseDict.Add(cleanCourseLocation, manifestoLink.AbsoluteUri);
}
}
}
Expand Down
7 changes: 3 additions & 4 deletions PoliNetwork.Graduatorie.Scraper/Utils/Web/ScraperOutput.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ public static void WriteLinks(List<RankingUrl> rankingsUrls, string? dataFolder)
}

public static void WriteManifesti(
SortedDictionary<string, SortedDictionary<string, SortedDictionary<string, string>>> manifesti, string? dataFolder)
SortedDictionary<string, SortedDictionary<string, SortedDictionary<string, string>>> manifesti,
string? dataFolder)
{
if (string.IsNullOrEmpty(dataFolder))
return;
Expand All @@ -74,10 +75,9 @@ public static void WriteManifesti(
var jsonString = JsonConvert.SerializeObject(manifesti, Culture.JsonSerializerSettings);

var count = manifesti.Sum(a => a.Value.Sum(b => b.Value.Count));

Console.WriteLine($"[INFO] ScraperOutput writing to file {filePath}: {count} manifesti");
File.WriteAllText(filePath, jsonString);

}

private static string GetOutputLinksString(IEnumerable<RankingUrl> rankingsUrls)
Expand Down Expand Up @@ -109,7 +109,6 @@ private static string GetLinksFilePath(string dataFolder)

/// <summary>
/// Composes the path of the manifesti output file located under <paramref name="dataFolder"/>.
/// </summary>
/// <param name="dataFolder">Folder that the output folder and file name are joined onto.</param>
/// <returns>
/// <paramref name="dataFolder"/> joined with <c>Constants.OutputFolder</c> and
/// <c>Constants.OutputManifestiFilename</c>.
/// </returns>
private static string GetManifestiFilePath(string dataFolder)
    => Path.Join(dataFolder, Constants.OutputFolder, Constants.OutputManifestiFilename);
}

0 comments on commit c8f581d

Please sign in to comment.