Mercurial > hg > Members > anatofuz > lectable
changeset 13:989cfda07d71
impl parse from pdf
author | anatofuz <anatofuz@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 01 Apr 2020 14:08:32 +0900 |
parents | e00b5301263a |
children | bac5eb544d4d |
files | cmd_pdf.go pdf.go |
diffstat | 2 files changed, 218 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
--- a/cmd_pdf.go Wed Apr 01 09:31:19 2020 +0900 +++ b/cmd_pdf.go Wed Apr 01 14:08:32 2020 +0900 @@ -17,7 +17,20 @@ } func (cd *cmdPDF) run(ctx context.Context, argv []string, outStream, errStream io.Writer) error { - h, _ := convertStringFromPDF(&argv) - fmt.Println(h) + ppsr, _ := convertStringFromPDF(&argv) + var fpfs []*lectureFPDF + for _, pps := range *ppsr { + for _, pp := range pps { + fpdf, err := str2lectureFPDF(pp) + if err != nil { + return err + } + fpfs = append(fpfs, fpdf...) + } + } + + for _, h := range fpfs { + fmt.Println(*h) + } return nil }
--- a/pdf.go Wed Apr 01 09:31:19 2020 +0900 +++ b/pdf.go Wed Apr 01 14:08:32 2020 +0900 @@ -2,8 +2,11 @@ import ( "bytes" + "fmt" "regexp" + "strconv" "strings" + "unicode" "github.com/ledongthuc/pdf" "github.com/pkg/errors" @@ -16,10 +19,9 @@ isSelect bool `json:isSelect` day string `json:day` place string `json:place` + grades []int `json:grades` } -var idRgex = regexp.MustCompile(`\d{9}`) - type pdfPagesStr []string func convertStringFromPDF(pdfPaths *[]string) (*[]pdfPagesStr, error) { @@ -46,3 +48,202 @@ } return &ppsr, nil } + +func isIE(s *string) bool { + if strings.HasPrefix(*s, "情") { // === '^情' ( 情01) + if strings.Contains(*s, "情報科学演習") { + return false + } + if !strings.Contains(*s, "日本語表現") { + return true + } + } + + if strings.HasPrefix(*s, "知能") { // === '^知能' (知能02) + return true + } + + if strings.Contains(*s, "情報工学") { + return true + } + + if strings.Contains(*s, "知能情報") { + return true + } + + if strings.Contains(*s, "電子情報通信") || strings.Contains(*s, "エネルギー環境") || + strings.Contains(*s, "機械工学") || strings.Contains(*s, "社会基盤デザイン") || strings.Contains(*s, "電気システム") { + return false + } + + if strings.Contains(*s, "GEプログラム") { + return true + } + + if strings.Contains(*s, "キャリアデザイン入門") { + return true + } + + if strings.Contains(*s, "情報科教育法") { + return true + } + + if strings.Contains(*s, "産業社会学") { + return true + } + return false +} + +var days = []string{"月", "火", "水", "木", "金"} +var kanjiDayLen = len("月") + +func getPlace(s string) (string, int, error) { + + var lastIndex int + if lastIndex = strings.Index(s, "半年"); lastIndex == -1 { + if lastIndex = strings.Index(s, "第"); lastIndex == -1 { + return "", 0, errors.New("failed found place") + } + } + + var startDay int + for _, day := range days { + if startDay = strings.Index(s, day); startDay != -1 { + if _, err := strconv.Atoi(s[startDay : startDay+kanjiDayLen]); err != nil { + startDay = strings.LastIndex(s, day) + } + if startDay < lastIndex { + fmt.Println(s[startDay:lastIndex]) + break + } + } + } + + distanceDayToPlace := 0 + for i, w := range s[startDay+kanjiDayLen : lastIndex] { // +1 is skip day kanji chara (ex. "月") + if unicode.Is(unicode.Han, w) { + distanceDayToPlace = i + break + } + } + + return s[startDay+kanjiDayLen+distanceDayToPlace : lastIndex], startDay, nil + +} + +var commaLen = len("・") + +func getGrades(s string, dayKanjiIndex int) ([]int, error) { + comma := s[dayKanjiIndex-commaLen-1 : dayKanjiIndex-1] + var grade []int + if comma == "・" { + prevgrade := s[dayKanjiIndex-commaLen-2 : dayKanjiIndex-commaLen-1] + pg, err := strconv.Atoi(prevgrade) + if err != nil { + return nil, err + } + grade = append(grade, pg) + } + gr, err := strconv.Atoi(s[dayKanjiIndex-1 : dayKanjiIndex]) + if err != nil { + return nil, err + } + grade = append(grade, gr) + return grade, nil +} + +var idRgex = regexp.MustCompile(`\d{9}`) + +func str2lectureFPDF(s string) ([]*lectureFPDF, error) { + is := idRgex.FindAllStringIndex(s, -1) + if is == nil { + return nil, nil + } + maxIndex := len(is) - 1 + + var lecturefpdfs []*lectureFPDF + + for i, h := range is { + var lecturefpd lectureFPDF + lecturefpd.id = s[h[0]:h[1]] + + if i != maxIndex { + lecturefpd.body = s[h[1]:is[i+1][0]] + } else { + lecturefpd.body = s[h[1]:] + } + lecturefpd.isSelect = strings.Contains(lecturefpd.body, "選") + + if strings.Contains(lecturefpd.body, "集中") || strings.Contains(lecturefpd.body, "通年") { + continue + } + + if strings.Contains(lecturefpd.body, "セミナー") || strings.Contains(lecturefpd.body, "卒業研究") { + continue + } + + if !isIE(&lecturefpd.body) { + continue + } + + if strings.Contains(lecturefpd.body, "大学英語") && !strings.Contains(lecturefpd.body, "大学英語を") { + _, err := parseUniversityEnglish(lecturefpd.body) + if err != nil { + return nil, errors.Wrap(err, "failed parse univ english") + } + continue + } + + lplace, indexG, err := getPlace(lecturefpd.body) + + if err != nil { + return nil, err + } + if indexG == -1 { + fmt.Printf("wan!!! %s not parsed ", lecturefpd.body) + continue + } + + lecturefpd.place = lplace + + grades, err := getGrades(lecturefpd.body, indexG) + if err != nil { + return nil, err + } + lecturefpd.grades = grades + + lecturefpdfs = append(lecturefpdfs, &lecturefpd) + } + return lecturefpdfs, nil +} + +func parseUniversityEnglish(s string) ([]*lectureFPDF, error) { + /* + 外101大学英語42―2非富里 明美8421~4月木2共3-205(月)共3-403(木)半年知能情報(42) + 外101大学英語42―2非宮城 和文9401~4月木2共3-403(月)共3-201(木)半年知能情報(20) + */ + + type uestruct struct { + index int + kanji string + } + + var uess []uestruct + + for _, day := range days { + if startDay := strings.Index(s, day); startDay != -1 { + ues := uestruct{ + index: startDay, + kanji: day, + } + uess = append(uess, ues) + } + } + + _, err := strconv.Atoi(s[uess[1].index+kanjiDayLen : uess[1].index+kanjiDayLen+1]) + if err != nil { + return nil, errors.Wrap(err, "failed parse int at univ english") + } + + return nil, nil +}